Skip to content

Commit 04cdb28

Browse files
format black
1 parent eeeb21d commit 04cdb28

18 files changed

+1510
-1318
lines changed

vtools/data/gap.py

Lines changed: 76 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -6,42 +6,44 @@
66
import pandas as pd
77
import numpy as np
88

9-
__all__ = ['gap_count','gap_size','gap_distance']
9+
__all__ = ["gap_count", "gap_size", "gap_distance"]
1010

11-
def gap_count(ts,state="gap",dtype=int):
12-
""" Count missing data
11+
12+
def gap_count(ts, state="gap", dtype=int):
13+
"""Count missing data
1314
Identifies gaps (runs of missing or non-missing data) and quantifies the
1415
length of the gap in terms of number of samples, which works better for
15-
regular series. Each time point receives the length of the run.
16-
16+
regular series. Each time point receives the length of the run.
17+
1718
Parameters
1819
----------
19-
20+
2021
ts : :class:`DataFrame <pandas:pandas.DataFrame>`
2122
Time series to analyze
22-
23+
2324
state : `str` one of 'gap'|'good'|'both'
24-
State to count. If state is gap, block size of missing data are counted
25+
State to count. If state is gap, block size of missing data are counted
2526
and reported for time points in the gap (every point in a given gap will
26-
receive the same value). Non missing data will have a size of zero.
27+
receive the same value). Non missing data will have a size of zero.
2728
Setting state to 'good' inverts this -- missing blocks are reported as
28-
zero and good data are counted.
29-
29+
zero and good data are counted.
30+
3031
dtype : `str` or `type`
3132
Data type of output, should be acceptable to
3233
pandas :meth:`astype <pandas:pandas.DataFrame.astype>`
33-
34-
"""
34+
35+
"""
36+
3537
def column_gap_count(ser):
3638
s = ser.index.to_series()
3739
tsout = ser.fillna(0).astype(dtype)
3840
miss = ser.isna()
39-
#create consecutive groups that increment each time the "is missing state" (na or not na) changes
41+
# create consecutive groups that increment each time the "is missing state" (na or not na) changes
4042
g = miss.ne(miss.shift()).cumsum()
4143

4244
# identify beginning (min time) of each state
4345
count = s.groupby(g).count()
44-
46+
4547
# g contains a group index for each member of out, and here
4648
# we map g to out which has cumulative time
4749
tsout = g.map(count)
@@ -50,15 +52,13 @@ def column_gap_count(ser):
5052
elif state == "good":
5153
tsout.loc[miss] = 0
5254
return tsout
53-
54-
if hasattr(ts,"columns"):
55-
return ts.apply(column_gap_count,axis=0,result_type='broadcast').astype(dtype)
55+
56+
if hasattr(ts, "columns"):
57+
return ts.apply(column_gap_count, axis=0, result_type="broadcast").astype(dtype)
5658
else:
5759
return column_gap_count(ts).astype(dtype)
5860

5961

60-
61-
6262
def gap_size(ts):
    """
    Identify gaps (runs of missing data) and quantify the length of each gap
    in minutes. Every time point inside a gap receives the full length of that
    gap; points with non-missing data receive zero. Time is measured from the
    moment the data first goes missing to the moment it first comes back (for
    a gap that runs to the end of the series, to the last index value).

    Parameters
    ----------
    ts : :class:`DataFrame <pandas:pandas.DataFrame>`
        Regular time series to analyze.

    Returns
    -------
    result : :class:`DataFrame <pandas:pandas.DataFrame>`
        A new regular time series with the same freq as the argument
        holding the size of the gap in minutes.
    """
    # start from a zeroed copy (NaN entries stay NaN until overwritten below)
    sizes = ts.mul(0.0)
    stamps = ts.index.to_series()
    for col in ts.columns:
        is_missing = ts[col].isna()
        # run ids: bump a counter each time the missing/non-missing state flips
        run_id = is_missing.ne(is_missing.shift()).cumsum()
        # earliest timestamp of each run
        run_start = stamps.groupby(run_id).min()
        # start of the following run; the final run borrows the last index value
        next_start = run_start.shift(-1).fillna(ts.index[-1])
        # run length in whole minutes
        run_minutes = ((next_start - run_start).dt.total_seconds() / 60).astype(int)
        # broadcast each run's length back onto its member rows
        sizes[col] = run_id.map(run_minutes)
        # non-missing rows report zero
        sizes.loc[~is_missing, col] = 0.0
    return sizes
145145

146146

147-
148-
def gap_distance(ts, disttype="count", to="good"):
    """
    For each element of ts, count the distance to the nearest good data or bad data.

    Parameters
    ----------

    ts : :class:`DataFrame <pandas:pandas.DataFrame>` or :class:`Series <pandas:pandas.Series>`
        Time series to analyze. A Series is converted to a one-column DataFrame.

    disttype : `str` one of 'count'|'freq'
        If disttype = "count" distances are a number of values. If disttype = "freq"
        the counts are multiplied by ts.index.freq (so if freq == "15min" the
        result is in minutes).

    to : `str` one of 'good'|'bad'
        If to = "good" this is the distance to the nearest good data (which is 0 for good data).
        If to = "bad", this is the distance to the nearest nan (which is 0 for nan).

    Returns
    -------
    result : :class:`DataFrame <pandas:pandas.DataFrame>`
        A new regular time series with the same freq as the argument
        holding the distance of good/bad data. If a column contains no data
        of the "anchor" state at all (e.g. entirely NaN with to='good'),
        its distances are undefined and reported as NaN.

    Raises
    ------
    ValueError
        If `to` is not 'good'/'bad' or `disttype` is not 'count'/'freq'.
    """
    si = ts.index.to_series()
    ts_out = ts.to_frame() if isinstance(ts, pd.Series) else ts.copy()
    cols = ts_out.columns
    for col in cols:
        # test missing values
        miss = ts_out[col].isna()
        # target_missing marks the state whose points still need a computed
        # distance; points already in the "to" state get distance 0.
        if to == "good":
            # .loc, not .at: .at requires a scalar label and rejects a
            # boolean mask in modern pandas
            ts_out.loc[~miss, col] = 0
            target_missing = True
        elif to == "bad":
            ts_out.loc[miss, col] = 0
            target_missing = False
        else:
            raise ValueError("invalid input to, must be good or bad")

        if np.any(miss == target_missing):
            # integer positions by state: mm[True] -> missing, mm[False] -> good
            mm = si.groupby(miss).indices
            if (not target_missing) in mm:
                # anchor positions hoisted out of the loop (loop-invariant)
                anchors = mm[not target_missing]
                for i in mm[target_missing]:
                    ts_out.at[si[i], col] = np.min(np.abs(i - anchors))
            else:
                # no anchor state anywhere in the column: distance is undefined
                # (the previous implementation raised KeyError here)
                ts_out.loc[miss == target_missing, col] = np.nan

    if disttype == "count":
        return ts_out
    elif disttype == "freq":
        return ts_out * ts.index.freq
    else:
        raise ValueError("invalid input disttype, must be count or freq")
204200

205201

206202
import pandas as pd
207203
import numpy as np
208204

205+
209206
def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
210207
"""
211208
Print gaps in a single Series s, showing `context` non-null points
@@ -221,7 +218,7 @@ def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
221218
# find rising edges (gap starts) and falling edges (gap ends)
222219
diffs = np.diff(mask.astype(int), prepend=0, append=0)
223220
starts = np.where(diffs == 1)[0]
224-
ends = np.where(diffs == -1)[0] - 1
221+
ends = np.where(diffs == -1)[0] - 1
225222

226223
for i, (st, en) in enumerate(zip(starts, ends), 1):
227224
gap_len = en - st + 1
@@ -253,6 +250,7 @@ def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
253250
print(f" ← {idx[pi]} : {s.iloc[pi]}")
254251
print()
255252

253+
256254
def describe_null(dset, name, context=2):
257255
"""
258256
If dset is a DataFrame, run describe_series_gaps on each column.
@@ -265,11 +263,11 @@ def describe_null(dset, name, context=2):
265263
describe_series_gaps(dset, name, context=context)
266264

267265

268-
269266
def example_gap():
270267
import numpy as np
271-
ndx = pd.date_range(pd.Timestamp(2017,1,1,12),freq='15min',periods=10)
272-
vals0 = np.arange(0.,10.,dtype='d')
268+
269+
ndx = pd.date_range(pd.Timestamp(2017, 1, 1, 12), freq="15min", periods=10)
270+
vals0 = np.arange(0.0, 10.0, dtype="d")
273271
vals1 = vals0.copy()
274272
vals2 = vals0.copy()
275273
vals0[0:3] = np.nan
@@ -278,15 +276,15 @@ def example_gap():
278276
vals1[6] = np.nan
279277
vals1[9] = np.nan
280278

281-
df = pd.DataFrame({'vals0':vals0,'vals1':vals1,'vals2':vals2},index = ndx)
279+
df = pd.DataFrame({"vals0": vals0, "vals1": vals1, "vals2": vals2}, index=ndx)
282280
out = gap_count(df)
283281
print(df)
284282
print(out)
285-
283+
286284
out = gap_distance(df)
287285
print("**")
288286
print(out)
289-
290-
if __name__=="__main__":
291-
example_gap()
292-
287+
288+
289+
if __name__ == "__main__":
290+
example_gap()

0 commit comments

Comments
 (0)