@@ -178,7 +178,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
178178
179179 # Other options
180180 for k , v in self .options .items ():
181- args += [ f'--{ k } ' , v ]
181+ args += [ f'--{ k } ' , str ( v ) ]
182182
183183 cmd = ' ' .join (args )
184184 try :
@@ -194,11 +194,16 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
194194 # TODO: support feature names. Separat output file, with --features
195195 out = numpy .load (features_path )
196196 windows = pandas .DataFrame (out )
197+ # FIXME: support reading times, not infer
197198 span = (data .index .max () - data .index .min ()).total_seconds ()
198- dt = span / len (windows ) # XXX: make correct
199+ dt = span / len (windows )
200+ log .debug ('preprocess' , windows = len (windows ), dt = dt )
199201 windows ['time' ] = dt * numpy .arange (len (windows ))
200202 elif self .serialization == 'csv' :
201203 windows = pandas .read_csv (features_path )
204+ span = (data .index .max () - data .index .min ()).total_seconds ()
205+ dt = span / len (windows )
206+ log .debug ('preprocess' , windows = len (windows ), dt = dt )
202207 else :
203208 raise NotImplementedError (self .serialization )
204209
@@ -208,7 +213,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
208213 time_in = data .index
209214 time_out = windows ['time' ]
210215
211- window_duration = pandas .Timedelta (5 .0 , unit = 's' ) # XXX: hardcoded
216+ window_duration = pandas .Timedelta (4 .0 , unit = 's' ) # XXX: hardcoded
212217 start_delta = time_out .min () - time_in .min ()
213218 assert abs (start_delta ) <= window_duration , (start_delta , time_out .min (), time_in .min ())
214219 end_delta = time_out .max () - time_in .max ()
@@ -219,7 +224,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
219224class TimebasedFeatureExtractor (DataProcessorProgram ):
220225
221226 def __init__ (self , python_bin = 'python' , ** kwargs ):
222- super ().__init__ (self , input_option = '' , output_option = '' , serialization = 'npy' , ** kwargs )
227+ super ().__init__ (self , serialization = 'npy' , ** kwargs )
223228
224229 here = os .path .dirname (__file__ )
225230 feature_extraction_script = os .path .join (here , 'compute_features.py' )
@@ -234,6 +239,7 @@ def extract_features(sensordata : pandas.DataFrame,
234239 columns : list [str ],
235240 groupby : list [str ],
236241 extractor ,
242+ samplerate = 50 ,
237243 sensitivity = 2.0 , # how many g range the int16 sensor data has
238244 label_column = 'activity' ,
239245 time_column = 'time' ,
@@ -300,12 +306,44 @@ def process_one(idx, stream : pandas.DataFrame) -> pandas.DataFrame:
300306 # for any time-dependent logic to stabilize, and to merge while ignoring the run-in
301307 def split_sections (data , groupby : list [str ], time_column = 'time' ):
302308 groups = sensordata .groupby (groupby , observed = True )
303- for group_idx , group_df in groups :
309+ for group_idx , df in groups :
304310
305311 # ensure sorted by time
306- group_df = group_df .reset_index ().set_index (time_column ).sort_index ()
312+ df = df .reset_index ()
313+
314+ # convert to time-delta, if neeeded
315+ if pandas .api .types .is_datetime64_dtype (df [time_column ]):
316+ df [time_column ] = df [time_column ] - df [time_column ].min ()
317+
318+ df = df .set_index (time_column ).sort_index ()
319+
320+ expected_freq = pandas .Timedelta (1 / samplerate , unit = 's' )
321+ diff = df .index .to_series ().diff ()
322+ holes = diff [diff > expected_freq ]
323+ irregular = diff [diff != expected_freq ].dropna ()
324+
325+ # Convert to regular time-series
326+ times = pandas .timedelta_range (df .index .min (), df .index .max (), freq = expected_freq )
327+ df = df .reindex (times )
328+
329+ missing = df [columns ].isna ().any (axis = 1 )
330+ missing_ratio = numpy .count_nonzero (missing ) / len (df )
331+ if missing_ratio > 0.01 :
332+ log .debug ('section-missing-data' ,
333+ idx = group_idx ,
334+ rows = len (df [missing ]),
335+ ratio = missing_ratio ,
336+ irregular = len (irregular ),
337+ )
307338
308- yield group_idx , group_df
339+ # Fill holes (if any)
340+ df = df .ffill ()
341+
342+ assert pandas .api .types .is_timedelta64_dtype (df .index )
343+
344+ df [time_column ] = df .index
345+
346+ yield group_idx , df
309347
310348 sections = split_sections (sensordata , groupby = groupby , time_column = time_column )
311349 jobs = [ joblib .delayed (process_one )(idx , df ) for idx , df in sections ]
@@ -351,9 +389,21 @@ def label_windows(sensordata,
351389 # default to unknown=NA
352390 windows [label_column ] = None
353391
392+ print (sensordata .head ())
393+
394+ sensor_groups = {idx : df for idx , df in sensordata .groupby (groupby , group_keys = False , as_index = False ) }
395+
396+ log .debug ('label-windows' , groups = groupby , g = list (sensor_groups .keys ()))
397+
354398 for idx , ww in windows .groupby (groupby ):
355- data = sensordata .loc [idx ]
356-
399+ data = sensor_groups [idx ]
400+ #log.debug('label-window', idx=idx, index_dtype=data.index.dtype)
401+ data = data .reset_index ().set_index ('time' ) # XXX: Why is this needed?
402+
403+ # convert to time-delta, if neeeded
404+ if pandas .api .types .is_datetime64_dtype (data .index ):
405+ data .index = data .index - data .index .min ()
406+
357407 for idx , w in ww .iterrows ():
358408 window_end = idx [- 1 ] # XXX: assuming this is time
359409 window_start = window_end - window_duration
@@ -397,6 +447,9 @@ def run_pipeline(run, hyperparameters, dataset,
397447 time_column = dataset_config .get ('time_column' , 'time' )
398448 sensitivity = dataset_config .get ('sensitivity' , 4.0 )
399449
450+ print ('dd' , sorted (data .columns ))
451+ print ('dt' , data .dtypes )
452+
400453 data [label_column ] = data [label_column ].astype (str )
401454
402455 data_load_duration = time .time () - data_load_start
@@ -408,26 +461,43 @@ def run_pipeline(run, hyperparameters, dataset,
408461 features = features ,
409462 )
410463 window_length = model_settings ['window_length' ]
411- samplerate = model_settings .get ('samplerate' , 100 )
412-
464+ samplerate = dataset_config .get ('samplerate' , 100 )
465+ window_hop = model_settings ['window_hop' ]
466+
413467 window_duration = (window_length / samplerate )
414468
469+ remap = {
470+ 'x' : 'acc_x' ,
471+ 'y' : 'acc_y' ,
472+ 'z' : 'acc_z' ,
473+ }
474+ if features == 'timebased' :
475+ # XXX: hack
476+ remap = {
477+ 'acc_x' : 'x' ,
478+ 'acc_y' : 'y' ,
479+ 'acc_z' : 'z' ,
480+ }
481+ data = data .rename (columns = remap )
482+
415483 # Setup feature extraction
484+ extract_options = dict (
485+ window_length = window_length ,
486+ hop_length = window_hop ,
487+ samplerate = samplerate ,
488+ )
416489 if features == 'timebased' :
417- columns = ['acc_x ' , 'acc_y ' , 'acc_z ' ]
418- extractor = TimebasedFeatureExtractor (column_order = columns )
490+ columns = ['x ' , 'y ' , 'z ' ]
491+ extractor = TimebasedFeatureExtractor (column_order = columns , options = extract_options )
419492
420493 elif features == 'custom' :
421494 # FIXME: unhardcode path
422495 executable = ['/home/jon/projects/emlearn/examples/motion_recognition/build/motion_preprocess' ]
423- # FIXME: respect window_length, window_hop
424- options = dict (
425- #window_length=window_length,
426- #window_hop=window_hop,
427- )
496+
428497 columns = ['time' , 'acc_x' , 'acc_y' , 'acc_z' , 'gyro_x' , 'gyro_y' , 'gyro_z' ]
498+ data_columns = [ c for c in columns if not c == 'time' ]
429499 extractor = DataProcessorProgram (program = executable ,
430- options = options , column_order = columns )
500+ options = extract_options , column_order = columns )
431501
432502 # Feature extractor expects these to be set
433503 data ['gyro_x' ] = 0.0
@@ -443,6 +513,7 @@ def run_pipeline(run, hyperparameters, dataset,
443513 sensitivity = sensitivity ,
444514 label_column = label_column ,
445515 time_column = time_column ,
516+ samplerate = samplerate ,
446517 )
447518
448519 # Attach labels
@@ -452,7 +523,7 @@ def run_pipeline(run, hyperparameters, dataset,
452523 window_duration = pandas .Timedelta (window_duration , unit = 's' ),
453524 )
454525
455- print (features .columns )
526+ print (features .head () )
456527
457528 labeled = numpy .count_nonzero (features [label_column ].notna ())
458529
0 commit comments