TEMP: har_trees: Try to fix toothbrushing dataset

jonnor · jonnor · commit fbe332e9d757 · 2025-11-17T10:06:22.000+01:00
diff --git a/examples/har_trees/data/configurations/uci_har.yaml b/examples/har_trees/data/configurations/uci_har.yaml
@@ -5,6 +5,8 @@ data_columns:
   - acc_x
   - acc_y
   - acc_z
+features: 'custom'
+samplerate: 50
 classes:
   # - STAND_TO_LIE
   # - SIT_TO_LIE
diff --git a/examples/har_trees/har_train.py b/examples/har_trees/har_train.py
@@ -178,7 +178,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
 
             # Other options
             for k, v in self.options.items():
-                args += [ f'--{k}', v ]
+                args += [ f'--{k}', str(v) ]
 
             cmd = ' '.join(args)
             try:
@@ -194,8 +194,9 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
                 # TODO: support feature names. Separat output file, with --features
                 out = numpy.load(features_path)
                 windows = pandas.DataFrame(out)
+                # FIXME: support reading window times instead of inferring them
                 span = (data.index.max() - data.index.min()).total_seconds()
-                dt = span / len(windows) # XXX: make correct
+                dt = span / len(windows)
                 windows['time'] = dt * numpy.arange(len(windows))
             elif self.serialization == 'csv':
                 windows = pandas.read_csv(features_path)
@@ -208,7 +209,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
         time_in = data.index
         time_out = windows['time']
 
-        window_duration = pandas.Timedelta(5.0, unit='s') # XXX: hardcoded
+        window_duration = pandas.Timedelta(4.0, unit='s') # XXX: hardcoded
         start_delta = time_out.min() - time_in.min()
         assert abs(start_delta) <= window_duration, (start_delta, time_out.min(), time_in.min())
         end_delta = time_out.max() - time_in.max()
@@ -300,12 +301,29 @@ def process_one(idx, stream : pandas.DataFrame) -> pandas.DataFrame:
     # for any time-dependent logic to stabilize, and to merge while ignoring the run-in
     def split_sections(data, groupby : list[str], time_column='time'):
         groups = sensordata.groupby(groupby, observed=True)
-        for group_idx, group_df in groups:
+        for group_idx, df in groups:
 
             # ensure sorted by time
-            group_df = group_df.reset_index().set_index(time_column).sort_index()
+            df = df.reset_index()
+            print('d', df.columns)
 
-            yield group_idx, group_df
+            # convert to time-delta, if needed
+            if pandas.api.types.is_datetime64_dtype(df[time_column]):
+                df[time_column] = df[time_column] - df[time_column].min() 
+
+            df = df.set_index(time_column).sort_index()
+
+    
+            samplerate = 50
+            expected_freq = pandas.Timedelta(1/samplerate, unit='s')
+            diff = df.index.to_series().diff()
+            holes = diff[diff > expected_freq]
+            irregular = diff[diff != expected_freq].dropna()
+            assert irregular.empty, irregular
+
+            assert pandas.api.types.is_timedelta64_dtype(df.index)
+
+            yield group_idx, df
 
     sections = split_sections(sensordata, groupby=groupby, time_column=time_column)
     jobs = [ joblib.delayed(process_one)(idx, df) for idx, df in sections]
@@ -408,10 +426,18 @@ def run_pipeline(run, hyperparameters, dataset,
         features=features,
     )
     window_length = model_settings['window_length']
-    samplerate = model_settings.get('samplerate', 100)
-
+    samplerate = dataset_config.get('samplerate', 100)
+    window_hop = window_length
+    
     window_duration = (window_length / samplerate)
 
+    remap = {
+        'x': 'acc_x',
+        'y': 'acc_y',
+        'z': 'acc_z',
+    }
+    data = data.rename(columns=remap)
+
     # Setup feature extraction
     if features == 'timebased':
         columns = ['acc_x', 'acc_y', 'acc_z']
@@ -420,12 +446,13 @@ def run_pipeline(run, hyperparameters, dataset,
     elif features == 'custom':
         # FIXME: unhardcode path
         executable = ['/home/jon/projects/emlearn/examples/motion_recognition/build/motion_preprocess']
-        # FIXME: respect window_length, window_hop
         options = dict(
-            #window_length=window_length,
-            #window_hop=window_hop,
+            window_length=window_length,
+            hop_length=window_hop,
+            samplerate=samplerate,
         )
         columns = ['time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
+        data_columns = [ c for c in columns if not c == 'time' ]
         extractor = DataProcessorProgram(program=executable,
             options=options, column_order=columns)