66import pandas as pd
77import numpy as np
88
9- __all__ = [' gap_count' , ' gap_size' , ' gap_distance' ]
9+ __all__ = [" gap_count" , " gap_size" , " gap_distance" ]
1010
11- def gap_count (ts ,state = "gap" ,dtype = int ):
12- """ Count missing data
11+
12+ def gap_count (ts , state = "gap" , dtype = int ):
13+ """Count missing data
1314 Identifies gaps (runs of missing or non-missing data) and quantifies the
1415 length of the gap in terms of number of samples, which works better for
15- regular series. Each time point receives the length of the run.
16-
16+ regular series. Each time point receives the length of the run.
17+
1718 Parameters
1819 ----------
19-
20+
2021 ts : :class:`DataFrame <pandas:pandas.DataFrame>`
2122 Time series to analyze
22-
23+
2324 state : `str` one of 'gap'|'good'|'both'
24- State to count. If state is gap, block size of missing data are counted
25+ State to count. If state is gap, block size of missing data are counted
2526 and reported for time points in the gap (every point in a given gap will
26- receive the same value). Non missing data will have a size of zero.
27+ receive the same value). Non missing data will have a size of zero.
2728 Setting state to 'good' inverts this -- missing blocks are reported as
28- zero and good data are counted.
29-
29+ zero and good data are counted.
30+
3031 dtype : `str` or `type`
3132 Data type of output, should be acceptable to
3233 pandas :meth:`astype <pandas:pandas.DataFrame.astype>`
33-
34- """
34+
35+ """
36+
3537 def column_gap_count (ser ):
3638 s = ser .index .to_series ()
3739 tsout = ser .fillna (0 ).astype (dtype )
3840 miss = ser .isna ()
39- #create consecutive groups that increment each time the "is missing state" (na or not na) changes
41+ # create consecutive groups that increment each time the "is missing state" (na or not na) changes
4042 g = miss .ne (miss .shift ()).cumsum ()
4143
4244 # identify beginning (min time) of each state
4345 count = s .groupby (g ).count ()
44-
46+
4547 # g contains a group index for each member of out, and here
4648 # we map g to out which has cumulative time
4749 tsout = g .map (count )
@@ -50,15 +52,13 @@ def column_gap_count(ser):
5052 elif state == "good" :
5153 tsout .loc [miss ] = 0
5254 return tsout
53-
54- if hasattr (ts ,"columns" ):
55- return ts .apply (column_gap_count ,axis = 0 ,result_type = ' broadcast' ).astype (dtype )
55+
56+ if hasattr (ts , "columns" ):
57+ return ts .apply (column_gap_count , axis = 0 , result_type = " broadcast" ).astype (dtype )
5658 else :
5759 return column_gap_count (ts ).astype (dtype )
5860
5961
60-
61-
6262def gap_size (ts ):
6363 """
6464 Identifies gaps (runs of missing data) and quantifies the
@@ -67,18 +67,18 @@ def gap_size(ts):
6767 with non-missing data returning zero. Time is measured from the time the
6868 data first started being missing to when the data first starts being not missing
6969 .
70-
70+
7171 Parameters
7272 ----------
73-
73+
7474 ts : :class:`DataFrame <pandas:pandas.DataFrame>`
75-
75+
7676 Returns
7777 -------
7878 result : :class:`DataFrame <pandas:pandas.DataFrame>`
7979 A new regular time series with the same freq as the argument
80- holding the size of the gap.
81-
80+ holding the size of the gap.
81+
8282 Examples
8383 --------
8484 >>> ndx = pd.date_range(pd.Timestamp(2017,1,1,12),freq='15min',periods=10)
@@ -87,7 +87,7 @@ def gap_size(ts):
8787 >>> vals2 = np.arange(0.,10.,dtype='d')
8888 >>> vals0[0:3] = np.nan
8989 >>> vals0[7:-1] = np.nan
90- >>> vals1[2:4] = np.nan>>>
90+ >>> vals1[2:4] = np.nan>>>
9191 >>> vals1[6] = np.nan
9292 >>> vals1[9] = np.nan
9393
@@ -105,7 +105,7 @@ def gap_size(ts):
105105 2017-01-01 13:45:00 NaN 7.0 7.0
106106 2017-01-01 14:00:00 NaN 8.0 8.0
107107 2017-01-01 14:15:00 9.0 NaN 9.0
108- >>> print(out)
108+ >>> print(out)
109109 vals0 vals1 vals2
110110 2017-01-01 12:00:00 45.0 0.0 0.0
111111 2017-01-01 12:15:00 45.0 0.0 0.0
@@ -116,96 +116,93 @@ def gap_size(ts):
116116 2017-01-01 13:30:00 0.0 15.0 0.0
117117 2017-01-01 13:45:00 30.0 0.0 0.0
118118 2017-01-01 14:00:00 30.0 0.0 0.0
119- 2017-01-01 14:15:00 0.0 0.0 0.0
120-
119+ 2017-01-01 14:15:00 0.0 0.0 0.0
120+
121121 """
122-
123- ts_out = ts * 0.
124-
122+
123+ ts_out = ts * 0.0
124+
125125 s = ts .index .to_series ()
126126 for c in ts .columns :
127- #test missing values
127+ # test missing values
128128 miss = ts [c ].isna ()
129- #create consecutive groups that increment each time the "is missing state" (na or not na) changes
129+ # create consecutive groups that increment each time the "is missing state" (na or not na) changes
130130 g = miss .ne (miss .shift ()).cumsum ()
131131 # identify beginning (min time) of each state
132132 m1 = s .groupby (g ).min ()
133-
134- #get beginning of next groups, last value is replaced last value of index
133+
134+ # get beginning of next groups, last value is replaced last value of index
135135 m2 = m1 .shift (- 1 ).fillna (ts .index [- 1 ])
136136
137- #get difference, convert to minutes
137+ # get difference, convert to minutes
138138 diffs = m2 .sub (m1 ).dt .total_seconds ().div (60 ).astype (int )
139-
139+
140140 # g contains a group index for each member of out, and here
141141 # we map g to out which has cumulative time
142142 ts_out [c ] = g .map (diffs )
143- ts_out .loc [~ miss ,c ] = 0.
143+ ts_out .loc [~ miss , c ] = 0.0
144144 return ts_out
145145
146146
147-
def _nearest_distance(target_pos, n):
    """Return, for positions 0..n-1, the sample count to the nearest target.

    ``target_pos`` must be a sorted 1-D integer array of positions (as produced
    by ``np.flatnonzero``). The result is a float array that is 0 at every
    target position. If there are no targets the distance is undefined and the
    result is all NaN.
    """
    if target_pos.size == 0:
        return np.full(n, np.nan)

    pos = np.arange(n)
    # Index of the first target at or after each position; the nearest target
    # before the position is the one just left of it.
    right = np.searchsorted(target_pos, pos)
    left = right - 1
    last = target_pos.size - 1

    # Clip indices so the fancy-indexing is always valid, then mask out the
    # sides that have no neighbor with +inf so they never win the minimum.
    dist_right = np.where(
        right <= last, target_pos[np.minimum(right, last)] - pos, np.inf
    )
    dist_left = np.where(left >= 0, pos - target_pos[np.maximum(left, 0)], np.inf)
    return np.minimum(dist_left, dist_right)


def gap_distance(ts, disttype="count", to="good"):
    """
    For each element of ts, count the distance to the nearest good data/or bad data.

    Parameters
    ----------

    ts : :class:`DataFrame <pandas:pandas.DataFrame>` or :class:`Series <pandas:pandas.Series>`
        Time series to analyze. A Series is converted to a one-column frame.

    disttype : `str` one of 'count'|'freq'
        If disttype = "count" the distance is the number of samples. If
        disttype = "freq" the counts are multiplied by ``ts.index.freq``
        (so the index must carry a frequency).

    to : `str` one of 'bad'|'good'

        If to = "good" this is the distance to the nearest good data (which is 0 for good data).
        If to = "bad", this is the distance to the nearest nan (which is 0 for nan).

    Returns
    -------
    result : :class:`DataFrame <pandas:pandas.DataFrame>`
        A new frame with the same index and columns as the argument holding
        the distance (as floats) to the nearest good/bad data. A column that
        contains no sample of the requested kind (e.g. an all-nan column with
        to = "good") is filled with nan because the distance is undefined.

    Raises
    ------
    ValueError
        If `to` is not 'good'/'bad' or `disttype` is not 'count'/'freq'.

    """
    # Validate up front so bad arguments fail before any work is done.
    if to not in ("good", "bad"):
        raise ValueError("invalid input to, must be good or bad")
    if disttype not in ("count", "freq"):
        raise ValueError("invalid input disttype, must be count or freq")

    frame = ts.to_frame() if isinstance(ts, pd.Series) else ts
    miss = frame.isna().to_numpy()
    nrow, ncol = miss.shape

    # Work positionally on numpy arrays: this avoids label-based scalar
    # access, which fails for boolean masks and for duplicated timestamps.
    dist = np.empty((nrow, ncol), dtype=float)
    for j in range(ncol):
        col_miss = miss[:, j]
        # Samples of the state we measure *to* have distance zero.
        at_target = ~col_miss if to == "good" else col_miss
        dist[:, j] = _nearest_distance(np.flatnonzero(at_target), nrow)

    ts_out = pd.DataFrame(dist, index=frame.index, columns=frame.columns)

    if disttype == "count":
        return ts_out
    return ts_out * ts.index.freq
205201
206202import pandas as pd
207203import numpy as np
208204
205+
209206def describe_series_gaps (s : pd .Series , name : str , context : int = 2 ):
210207 """
211208 Print gaps in a single Series s, showing `context` non-null points
@@ -221,7 +218,7 @@ def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
221218 # find rising edges (gap starts) and falling edges (gap ends)
222219 diffs = np .diff (mask .astype (int ), prepend = 0 , append = 0 )
223220 starts = np .where (diffs == 1 )[0 ]
224- ends = np .where (diffs == - 1 )[0 ] - 1
221+ ends = np .where (diffs == - 1 )[0 ] - 1
225222
226223 for i , (st , en ) in enumerate (zip (starts , ends ), 1 ):
227224 gap_len = en - st + 1
@@ -253,6 +250,7 @@ def describe_series_gaps(s: pd.Series, name: str, context: int = 2):
253250 print (f" ← { idx [pi ]} : { s .iloc [pi ]} " )
254251 print ()
255252
253+
256254def describe_null (dset , name , context = 2 ):
257255 """
258256 If dset is a DataFrame, run describe_series_gaps on each column.
@@ -265,11 +263,11 @@ def describe_null(dset, name, context=2):
265263 describe_series_gaps (dset , name , context = context )
266264
267265
268-
269266def example_gap ():
270267 import numpy as np
271- ndx = pd .date_range (pd .Timestamp (2017 ,1 ,1 ,12 ),freq = '15min' ,periods = 10 )
272- vals0 = np .arange (0. ,10. ,dtype = 'd' )
268+
269+ ndx = pd .date_range (pd .Timestamp (2017 , 1 , 1 , 12 ), freq = "15min" , periods = 10 )
270+ vals0 = np .arange (0.0 , 10.0 , dtype = "d" )
273271 vals1 = vals0 .copy ()
274272 vals2 = vals0 .copy ()
275273 vals0 [0 :3 ] = np .nan
@@ -278,15 +276,15 @@ def example_gap():
278276 vals1 [6 ] = np .nan
279277 vals1 [9 ] = np .nan
280278
281- df = pd .DataFrame ({' vals0' : vals0 ,' vals1' : vals1 ,' vals2' : vals2 },index = ndx )
279+ df = pd .DataFrame ({" vals0" : vals0 , " vals1" : vals1 , " vals2" : vals2 }, index = ndx )
282280 out = gap_count (df )
283281 print (df )
284282 print (out )
285-
283+
286284 out = gap_distance (df )
287285 print ("**" )
288286 print (out )
289-
290- if __name__ == "__main__" :
291- example_gap ()
292-
287+
288+
289+ if __name__ == "__main__" :
290+ example_gap ()
0 commit comments