har_trees: Try fix toothbrushing, support overlapped windows for timebased

jonnor · jonnor · commit 8c1d1884aa97 · 2025-11-22T23:54:41.000+01:00
diff --git a/examples/har_trees/compute_features.py b/examples/har_trees/compute_features.py
@@ -8,8 +8,16 @@
 from timebased import calculate_features_xyz, DATA_TYPECODE, N_FEATURES
 
 def compute_dataset_features(data: npyfile.Reader, window_length,
+        hop_length=None,
         skip_samples=0, limit_samples=None, verbose=0):
 
+    if hop_length is None:
+        hop_length = window_length
+
+    if hop_length <= 0 or window_length % hop_length != 0:  # check sign first: a zero hop would raise ZeroDivisionError in the modulo
+        raise ValueError(f"hop_length must be a positive divisor of window_length. Got window={window_length} hop={hop_length}")
+
+
     # Check that data is expected format
     shape = data.shape
     assert len(shape) == 2, shape
@@ -25,31 +33,39 @@ def compute_dataset_features(data: npyfile.Reader, window_length,
     y_values = array.array(DATA_TYPECODE, (0 for _ in range(window_length)))
     z_values = array.array(DATA_TYPECODE, (0 for _ in range(window_length)))
 
-    chunk_size = window_length*n_axes
-    sample_counter = 0
+    chunk_size = hop_length*n_axes
+    window_counter = 0
+    start_idx = 0
 
     data_chunks = data.read_data_chunks(chunk_size, offset=chunk_size*skip_samples)
+
     for arr in data_chunks:
 
         print('cc', len(arr))
         if len(arr) < chunk_size:
             # short read, last data piece, ignore
             continue
 
-        # process the data
+        # Window was full, make room for more
+        if start_idx >= window_length:
+            overlap = window_length - hop_length
+            if overlap > 0:
+                x_values[:overlap] = x_values[hop_length:]
+                y_values[:overlap] = y_values[hop_length:]
+                z_values[:overlap] = z_values[hop_length:]
+            start_idx = overlap
+
+        # Copy the new hop in at start_idx, behind any samples kept from the previous window
-        # De-interleave data from XYZ1 XYZ2... into separate continious X,Y,Z
-        for i in range(window_length):
-            x_values[i] = arr[(i*3)+0]
-            y_values[i] = arr[(i*3)+1]
-            z_values[i] = arr[(i*3)+2]
+        # De-interleave data from XYZ1 XYZ2... into separate continuous X,Y,Z
+        for i in range(hop_length):
+            x_values[start_idx + i] = arr[(i*3)+0]
+            y_values[start_idx + i] = arr[(i*3)+1]
+            z_values[start_idx + i] = arr[(i*3)+2]
+        start_idx += hop_length
 
-        #print(x_values)
-        #print(y_values)
-        #print(z_values)
-
-        assert len(x_values) == window_length
-        assert len(y_values) == window_length
-        assert len(z_values) == window_length
+        # waiting for window to fill
+        if start_idx < window_length:
+            continue
 
         feature_calc_start = time.ticks_ms()
         features = calculate_features_xyz((x_values, y_values, z_values))
@@ -58,39 +74,47 @@ def compute_dataset_features(data: npyfile.Reader, window_length,
             print('feature-calc-end', duration)
 
         yield features
+        window_counter += 1
 
-        sample_counter += 1
-        if limit_samples is not None and sample_counter > limit_samples:
+        if limit_samples is not None and window_counter >= limit_samples:  # counter already includes the window just yielded
             break
 
-def main():
+def parse():
+    import argparse
 
-    if len(sys.argv) != 3:
-        print('Usage: compute_features.py IN.npy OUT.npy')
+    parser = argparse.ArgumentParser(description='Compute features from NPY file')
+    parser.add_argument('--input', required=True, help='Input .npy file')
+    parser.add_argument('--output', required=True, help='Output .npy file')
+    parser.add_argument('--samplerate', type=int, default=None, help='Samplerate. Currently ignored')
+    parser.add_argument('--skip', type=int, default=0, help='Number of samples to skip (default: 0)')
+    parser.add_argument('--limit', type=int, default=None, help='Maximum number of samples to process (default: None)')
+    parser.add_argument('--window_length', type=int, default=128, help='Window length (default: 128)')
+    parser.add_argument('--hop_length', type=int, default=None, help='Hop length (default: window_length)')
 
-    _, in_path, out_path = sys.argv
+    args = parser.parse_args()
+    return args
+
+def main():
 
-    skip_samples = 0
-    limit_samples = None
+    args = parse()
 
     out_typecode = 'f'
-    n_features = N_FEATURES
-    window_length = 128
-    
+    n_features = N_FEATURES    
     features_array = array.array(out_typecode, (0 for _ in range(n_features)))
 
-    with npyfile.Reader(in_path) as data:
+    with npyfile.Reader(args.input) as data:
         n_samples, n_axes = data.shape
 
-        n_windows = n_samples // window_length
+        n_windows = max(0, 1 + (n_samples - args.window_length) // (args.hop_length or args.window_length))  # hop defaults to window; +1 counts the first full window
 
         out_shape = (n_windows, n_features)
-        with npyfile.Writer(out_path, out_shape, out_typecode) as out:
+        with npyfile.Writer(args.output, out_shape, out_typecode) as out:
 
             generator = compute_dataset_features(data,
-                window_length=window_length,
-                skip_samples=skip_samples,
-                limit_samples=limit_samples,
+                window_length=args.window_length,
+                hop_length=args.hop_length,
+                skip_samples=args.skip,
+                limit_samples=args.limit,
             )
             for features in generator:
                 #print('features', len(features), features)
diff --git a/examples/har_trees/data/configurations/uci_har.yaml b/examples/har_trees/data/configurations/uci_har.yaml
@@ -5,6 +5,8 @@ data_columns:
   - acc_x
   - acc_y
   - acc_z
+features: 'custom'
+samplerate: 50
 classes:
   # - STAND_TO_LIE
   # - SIT_TO_LIE
diff --git a/examples/har_trees/har_train.py b/examples/har_trees/har_train.py
@@ -178,7 +178,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
 
             # Other options
             for k, v in self.options.items():
-                args += [ f'--{k}', v ]
+                args += [ f'--{k}', str(v) ]
 
             cmd = ' '.join(args)
             try:
@@ -194,11 +194,16 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
                 # TODO: support feature names. Separat output file, with --features
                 out = numpy.load(features_path)
                 windows = pandas.DataFrame(out)
+                # FIXME: support reading times, not infer
                 span = (data.index.max() - data.index.min()).total_seconds()
-                dt = span / len(windows) # XXX: make correct
+                dt = span / len(windows)
+                log.debug('preprocess', windows=len(windows), dt=dt)
                 windows['time'] = dt * numpy.arange(len(windows))
             elif self.serialization == 'csv':
                 windows = pandas.read_csv(features_path)
+                span = (data.index.max() - data.index.min()).total_seconds()
+                dt = span / len(windows)
+                log.debug('preprocess', windows=len(windows), dt=dt)
             else:
                 raise NotImplementedError(self.serialization)
 
@@ -208,7 +213,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
         time_in = data.index
         time_out = windows['time']
 
-        window_duration = pandas.Timedelta(5.0, unit='s') # XXX: hardcoded
+        window_duration = pandas.Timedelta(4.0, unit='s') # XXX: hardcoded
         start_delta = time_out.min() - time_in.min()
         assert abs(start_delta) <= window_duration, (start_delta, time_out.min(), time_in.min())
         end_delta = time_out.max() - time_in.max()
@@ -219,7 +224,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
 class TimebasedFeatureExtractor(DataProcessorProgram):
 
     def __init__(self, python_bin='python', **kwargs):
-        super().__init__(self, input_option='', output_option='', serialization='npy', **kwargs)
+        super().__init__(self, serialization='npy', **kwargs)
 
         here = os.path.dirname(__file__)
         feature_extraction_script = os.path.join(here, 'compute_features.py')
@@ -234,6 +239,7 @@ def extract_features(sensordata : pandas.DataFrame,
     columns : list[str],
     groupby : list[str],
     extractor,
+    samplerate = 50,
     sensitivity = 2.0, # how many g range the int16 sensor data has
     label_column='activity',
     time_column='time',
@@ -300,12 +306,44 @@ def process_one(idx, stream : pandas.DataFrame) -> pandas.DataFrame:
     # for any time-dependent logic to stabilize, and to merge while ignoring the run-in
     def split_sections(data, groupby : list[str], time_column='time'):
         groups = sensordata.groupby(groupby, observed=True)
-        for group_idx, group_df in groups:
+        for group_idx, df in groups:
 
             # ensure sorted by time
-            group_df = group_df.reset_index().set_index(time_column).sort_index()
+            df = df.reset_index()
+
+            # convert to time-delta, if needed
+            if pandas.api.types.is_datetime64_dtype(df[time_column]):
+                df[time_column] = df[time_column] - df[time_column].min() 
+
+            df = df.set_index(time_column).sort_index()
+
+            expected_freq = pandas.Timedelta(1/samplerate, unit='s')
+            diff = df.index.to_series().diff()
+            holes = diff[diff > expected_freq]
+            irregular = diff[diff != expected_freq].dropna()
+
+            # Convert to regular time-series
+            times = pandas.timedelta_range(df.index.min(), df.index.max(), freq=expected_freq)
+            df = df.reindex(times)
+
+            missing = df[columns].isna().any(axis=1)
+            missing_ratio = numpy.count_nonzero(missing) / len(df)
+            if missing_ratio > 0.01:
+                log.debug('section-missing-data',
+                    idx=group_idx,
+                    rows=len(df[missing]),
+                    ratio=missing_ratio,
+                    irregular=len(irregular),
+                )
 
-            yield group_idx, group_df
+            # Fill holes (if any)
+            df = df.ffill()
+
+            assert pandas.api.types.is_timedelta64_dtype(df.index)
+
+            df[time_column] = df.index
+
+            yield group_idx, df
 
     sections = split_sections(sensordata, groupby=groupby, time_column=time_column)
     jobs = [ joblib.delayed(process_one)(idx, df) for idx, df in sections]
@@ -351,9 +389,21 @@ def label_windows(sensordata,
     # default to unknown=NA
     windows[label_column] = None
 
+    print(sensordata.head())
+
+    sensor_groups = {idx: df for idx, df in sensordata.groupby(groupby, group_keys=False, as_index=False) }
+
+    log.debug('label-windows', groups=groupby, g=list(sensor_groups.keys()))
+
     for idx, ww in windows.groupby(groupby):
-        data = sensordata.loc[idx]
-        
+        data = sensor_groups[idx]
+        #log.debug('label-window', idx=idx, index_dtype=data.index.dtype)        
+        data = data.reset_index().set_index('time') # XXX: Why is this needed?
+
+        # convert to time-delta, if needed
+        if pandas.api.types.is_datetime64_dtype(data.index):
+            data.index = data.index - data.index.min() 
+
         for idx, w in ww.iterrows():
             window_end = idx[-1] # XXX: assuming this is time
             window_start = window_end - window_duration
@@ -397,6 +447,9 @@ def run_pipeline(run, hyperparameters, dataset,
     time_column = dataset_config.get('time_column', 'time')
     sensitivity = dataset_config.get('sensitivity', 4.0)
 
+    print('dd', sorted(data.columns))
+    print('dt', data.dtypes)
+
     data[label_column] = data[label_column].astype(str)
 
     data_load_duration = time.time() - data_load_start
@@ -408,26 +461,43 @@ def run_pipeline(run, hyperparameters, dataset,
         features=features,
     )
     window_length = model_settings['window_length']
-    samplerate = model_settings.get('samplerate', 100)
-
+    samplerate = dataset_config.get('samplerate', 100)
+    window_hop = model_settings['window_hop']
+    
     window_duration = (window_length / samplerate)
 
+    remap = {
+        'x': 'acc_x',
+        'y': 'acc_y',
+        'z': 'acc_z',
+    }
+    if features == 'timebased':
+        # XXX: hack
+        remap = {
+            'acc_x': 'x',
+            'acc_y': 'y',
+            'acc_z': 'z',
+        }
+    data = data.rename(columns=remap)
+
     # Setup feature extraction
+    extract_options = dict(
+        window_length=window_length,
+        hop_length=window_hop,
+        samplerate=samplerate,
+    )
     if features == 'timebased':
-        columns = ['acc_x', 'acc_y', 'acc_z']
-        extractor = TimebasedFeatureExtractor(column_order=columns)
+        columns = ['x', 'y', 'z']
+        extractor = TimebasedFeatureExtractor(column_order=columns, options=extract_options)
 
     elif features == 'custom':
         # FIXME: unhardcode path
         executable = ['/home/jon/projects/emlearn/examples/motion_recognition/build/motion_preprocess']
-        # FIXME: respect window_length, window_hop
-        options = dict(
-            #window_length=window_length,
-            #window_hop=window_hop,
-        )
+
         columns = ['time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
+        data_columns = [ c for c in columns if not c == 'time' ]
         extractor = DataProcessorProgram(program=executable,
-            options=options, column_order=columns)
+            options=extract_options, column_order=columns)
 
         # Feature extractor expects these to be set
         data['gyro_x'] = 0.0
@@ -443,6 +513,7 @@ def run_pipeline(run, hyperparameters, dataset,
         sensitivity=sensitivity,
         label_column=label_column,
         time_column=time_column,
+        samplerate=samplerate,
     )
 
     # Attach labels
@@ -452,7 +523,7 @@ def run_pipeline(run, hyperparameters, dataset,
         window_duration=pandas.Timedelta(window_duration, unit='s'),
     )
 
-    print(features.columns)
+    print(features.head())
 
     labeled = numpy.count_nonzero(features[label_column].notna())