Skip to content

Commit 8c1d188

Browse files
committed
har_trees: Try fix toothbrushing, support overlapped windows for timebased
1 parent 0c137a3 commit 8c1d188

3 files changed

Lines changed: 145 additions & 48 deletions

File tree

examples/har_trees/compute_features.py

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@
88
from timebased import calculate_features_xyz, DATA_TYPECODE, N_FEATURES
99

1010
def compute_dataset_features(data: npyfile.Reader, window_length,
11+
hop_length=None,
1112
skip_samples=0, limit_samples=None, verbose=0):
1213

14+
if hop_length is None:
15+
hop_length = window_length
16+
17+
if window_length % hop_length != 0:
18+
raise ValueError(f"hop_length must be an even divisor of window_length. Got window={window_length} hop={hop_length}")
19+
20+
1321
# Check that data is expected format
1422
shape = data.shape
1523
assert len(shape) == 2, shape
@@ -25,31 +33,39 @@ def compute_dataset_features(data: npyfile.Reader, window_length,
2533
y_values = array.array(DATA_TYPECODE, (0 for _ in range(window_length)))
2634
z_values = array.array(DATA_TYPECODE, (0 for _ in range(window_length)))
2735

28-
chunk_size = window_length*n_axes
29-
sample_counter = 0
36+
chunk_size = hop_length*n_axes
37+
window_counter = 0
38+
start_idx = 0
3039

3140
data_chunks = data.read_data_chunks(chunk_size, offset=chunk_size*skip_samples)
41+
3242
for arr in data_chunks:
3343

3444
print('cc', len(arr))
3545
if len(arr) < chunk_size:
3646
# short read, last data piece, ignore
3747
continue
3848

39-
# process the data
49+
# Window was full, make room for more
50+
if start_idx >= window_length:
51+
overlap = window_length - hop_length
52+
if overlap > 0:
53+
x_values[:overlap] = x_values[hop_length:]
54+
y_values[:overlap] = y_values[hop_length:]
55+
z_values[:overlap] = z_values[hop_length:]
56+
start_idx = overlap
57+
58+
# Copy the input data
4059
# De-interleave data from XYZ1 XYZ2... into separate continious X,Y,Z
41-
for i in range(window_length):
60+
for i in range(hop_length):
4261
x_values[i] = arr[(i*3)+0]
4362
y_values[i] = arr[(i*3)+1]
4463
z_values[i] = arr[(i*3)+2]
64+
start_idx += hop_length
4565

46-
#print(x_values)
47-
#print(y_values)
48-
#print(z_values)
49-
50-
assert len(x_values) == window_length
51-
assert len(y_values) == window_length
52-
assert len(z_values) == window_length
66+
# waiting for window to fill
67+
if start_idx < window_length:
68+
continue
5369

5470
feature_calc_start = time.ticks_ms()
5571
features = calculate_features_xyz((x_values, y_values, z_values))
@@ -58,39 +74,47 @@ def compute_dataset_features(data: npyfile.Reader, window_length,
5874
print('feature-calc-end', duration)
5975

6076
yield features
77+
window_counter += 1
6178

62-
sample_counter += 1
63-
if limit_samples is not None and sample_counter > limit_samples:
79+
if limit_samples is not None and window_counter > limit_samples:
6480
break
6581

66-
def main():
82+
def parse():
83+
import argparse
6784

68-
if len(sys.argv) != 3:
69-
print('Usage: compute_features.py IN.npy OUT.npy')
85+
parser = argparse.ArgumentParser(description='Compute features from NPY file')
86+
parser.add_argument('--input', required=True, help='Input .npy file')
87+
parser.add_argument('--output', required=True, help='Output .npy file')
88+
parser.add_argument('--samplerate', type=int, default=None, help='Samplerate. Currently ignored')
89+
parser.add_argument('--skip', type=int, default=0, help='Number of samples to skip (default: 0)')
90+
parser.add_argument('--limit', type=int, default=None, help='Maximum number of samples to process (default: None)')
91+
parser.add_argument('--window_length', type=int, default=128, help='Window length (default: 128)')
92+
parser.add_argument('--hop_length', type=int, default=None, help='Hop length (default: window_length)')
7093

71-
_, in_path, out_path = sys.argv
94+
args = parser.parse_args()
95+
return args
96+
97+
def main():
7298

73-
skip_samples = 0
74-
limit_samples = None
99+
args = parse()
75100

76101
out_typecode = 'f'
77-
n_features = N_FEATURES
78-
window_length = 128
79-
102+
n_features = N_FEATURES
80103
features_array = array.array(out_typecode, (0 for _ in range(n_features)))
81104

82-
with npyfile.Reader(in_path) as data:
105+
with npyfile.Reader(args.input) as data:
83106
n_samples, n_axes = data.shape
84107

85-
n_windows = n_samples // window_length
108+
n_windows = (n_samples - args.window_length) // args.hop_length
86109

87110
out_shape = (n_windows, n_features)
88-
with npyfile.Writer(out_path, out_shape, out_typecode) as out:
111+
with npyfile.Writer(args.output, out_shape, out_typecode) as out:
89112

90113
generator = compute_dataset_features(data,
91-
window_length=window_length,
92-
skip_samples=skip_samples,
93-
limit_samples=limit_samples,
114+
window_length=args.window_length,
115+
hop_length=args.hop_length,
116+
skip_samples=args.skip,
117+
limit_samples=args.limit,
94118
)
95119
for features in generator:
96120
#print('features', len(features), features)

examples/har_trees/data/configurations/uci_har.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ data_columns:
55
- acc_x
66
- acc_y
77
- acc_z
8+
features: 'custom'
9+
samplerate: 50
810
classes:
911
# - STAND_TO_LIE
1012
# - SIT_TO_LIE

examples/har_trees/har_train.py

Lines changed: 91 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
178178

179179
# Other options
180180
for k, v in self.options.items():
181-
args += [ f'--{k}', v ]
181+
args += [ f'--{k}', str(v) ]
182182

183183
cmd = ' '.join(args)
184184
try:
@@ -194,11 +194,16 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
194194
# TODO: support feature names. Separat output file, with --features
195195
out = numpy.load(features_path)
196196
windows = pandas.DataFrame(out)
197+
# FIXME: support reading times, not infer
197198
span = (data.index.max() - data.index.min()).total_seconds()
198-
dt = span / len(windows) # XXX: make correct
199+
dt = span / len(windows)
200+
log.debug('preprocess', windows=len(windows), dt=dt)
199201
windows['time'] = dt * numpy.arange(len(windows))
200202
elif self.serialization == 'csv':
201203
windows = pandas.read_csv(features_path)
204+
span = (data.index.max() - data.index.min()).total_seconds()
205+
dt = span / len(windows)
206+
log.debug('preprocess', windows=len(windows), dt=dt)
202207
else:
203208
raise NotImplementedError(self.serialization)
204209

@@ -208,7 +213,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
208213
time_in = data.index
209214
time_out = windows['time']
210215

211-
window_duration = pandas.Timedelta(5.0, unit='s') # XXX: hardcoded
216+
window_duration = pandas.Timedelta(4.0, unit='s') # XXX: hardcoded
212217
start_delta = time_out.min() - time_in.min()
213218
assert abs(start_delta) <= window_duration, (start_delta, time_out.min(), time_in.min())
214219
end_delta = time_out.max() - time_in.max()
@@ -219,7 +224,7 @@ def process(self, data : pandas.DataFrame) -> pandas.DataFrame:
219224
class TimebasedFeatureExtractor(DataProcessorProgram):
220225

221226
def __init__(self, python_bin='python', **kwargs):
222-
super().__init__(self, input_option='', output_option='', serialization='npy', **kwargs)
227+
super().__init__(self, serialization='npy', **kwargs)
223228

224229
here = os.path.dirname(__file__)
225230
feature_extraction_script = os.path.join(here, 'compute_features.py')
@@ -234,6 +239,7 @@ def extract_features(sensordata : pandas.DataFrame,
234239
columns : list[str],
235240
groupby : list[str],
236241
extractor,
242+
samplerate = 50,
237243
sensitivity = 2.0, # how many g range the int16 sensor data has
238244
label_column='activity',
239245
time_column='time',
@@ -300,12 +306,44 @@ def process_one(idx, stream : pandas.DataFrame) -> pandas.DataFrame:
300306
# for any time-dependent logic to stabilize, and to merge while ignoring the run-in
301307
def split_sections(data, groupby : list[str], time_column='time'):
302308
groups = sensordata.groupby(groupby, observed=True)
303-
for group_idx, group_df in groups:
309+
for group_idx, df in groups:
304310

305311
# ensure sorted by time
306-
group_df = group_df.reset_index().set_index(time_column).sort_index()
312+
df = df.reset_index()
313+
314+
# convert to time-delta, if neeeded
315+
if pandas.api.types.is_datetime64_dtype(df[time_column]):
316+
df[time_column] = df[time_column] - df[time_column].min()
317+
318+
df = df.set_index(time_column).sort_index()
319+
320+
expected_freq = pandas.Timedelta(1/samplerate, unit='s')
321+
diff = df.index.to_series().diff()
322+
holes = diff[diff > expected_freq]
323+
irregular = diff[diff != expected_freq].dropna()
324+
325+
# Convert to regular time-series
326+
times = pandas.timedelta_range(df.index.min(), df.index.max(), freq=expected_freq)
327+
df = df.reindex(times)
328+
329+
missing = df[columns].isna().any(axis=1)
330+
missing_ratio = numpy.count_nonzero(missing) / len(df)
331+
if missing_ratio > 0.01:
332+
log.debug('section-missing-data',
333+
idx=group_idx,
334+
rows=len(df[missing]),
335+
ratio=missing_ratio,
336+
irregular=len(irregular),
337+
)
307338

308-
yield group_idx, group_df
339+
# Fill holes (if any)
340+
df = df.ffill()
341+
342+
assert pandas.api.types.is_timedelta64_dtype(df.index)
343+
344+
df[time_column] = df.index
345+
346+
yield group_idx, df
309347

310348
sections = split_sections(sensordata, groupby=groupby, time_column=time_column)
311349
jobs = [ joblib.delayed(process_one)(idx, df) for idx, df in sections]
@@ -351,9 +389,21 @@ def label_windows(sensordata,
351389
# default to unknown=NA
352390
windows[label_column] = None
353391

392+
print(sensordata.head())
393+
394+
sensor_groups = {idx: df for idx, df in sensordata.groupby(groupby, group_keys=False, as_index=False) }
395+
396+
log.debug('label-windows', groups=groupby, g=list(sensor_groups.keys()))
397+
354398
for idx, ww in windows.groupby(groupby):
355-
data = sensordata.loc[idx]
356-
399+
data = sensor_groups[idx]
400+
#log.debug('label-window', idx=idx, index_dtype=data.index.dtype)
401+
data = data.reset_index().set_index('time') # XXX: Why is this needed?
402+
403+
# convert to time-delta, if neeeded
404+
if pandas.api.types.is_datetime64_dtype(data.index):
405+
data.index = data.index - data.index.min()
406+
357407
for idx, w in ww.iterrows():
358408
window_end = idx[-1] # XXX: assuming this is time
359409
window_start = window_end - window_duration
@@ -397,6 +447,9 @@ def run_pipeline(run, hyperparameters, dataset,
397447
time_column = dataset_config.get('time_column', 'time')
398448
sensitivity = dataset_config.get('sensitivity', 4.0)
399449

450+
print('dd', sorted(data.columns))
451+
print('dt', data.dtypes)
452+
400453
data[label_column] = data[label_column].astype(str)
401454

402455
data_load_duration = time.time() - data_load_start
@@ -408,26 +461,43 @@ def run_pipeline(run, hyperparameters, dataset,
408461
features=features,
409462
)
410463
window_length = model_settings['window_length']
411-
samplerate = model_settings.get('samplerate', 100)
412-
464+
samplerate = dataset_config.get('samplerate', 100)
465+
window_hop = model_settings['window_hop']
466+
413467
window_duration = (window_length / samplerate)
414468

469+
remap = {
470+
'x': 'acc_x',
471+
'y': 'acc_y',
472+
'z': 'acc_z',
473+
}
474+
if features == 'timebased':
475+
# XXX: hack
476+
remap = {
477+
'acc_x': 'x',
478+
'acc_y': 'y',
479+
'acc_z': 'z',
480+
}
481+
data = data.rename(columns=remap)
482+
415483
# Setup feature extraction
484+
extract_options = dict(
485+
window_length=window_length,
486+
hop_length=window_hop,
487+
samplerate=samplerate,
488+
)
416489
if features == 'timebased':
417-
columns = ['acc_x', 'acc_y', 'acc_z']
418-
extractor = TimebasedFeatureExtractor(column_order=columns)
490+
columns = ['x', 'y', 'z']
491+
extractor = TimebasedFeatureExtractor(column_order=columns, options=extract_options)
419492

420493
elif features == 'custom':
421494
# FIXME: unhardcode path
422495
executable = ['/home/jon/projects/emlearn/examples/motion_recognition/build/motion_preprocess']
423-
# FIXME: respect window_length, window_hop
424-
options = dict(
425-
#window_length=window_length,
426-
#window_hop=window_hop,
427-
)
496+
428497
columns = ['time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
498+
data_columns = [ c for c in columns if not c == 'time' ]
429499
extractor = DataProcessorProgram(program=executable,
430-
options=options, column_order=columns)
500+
options=extract_options, column_order=columns)
431501

432502
# Feature extractor expects these to be set
433503
data['gyro_x'] = 0.0
@@ -443,6 +513,7 @@ def run_pipeline(run, hyperparameters, dataset,
443513
sensitivity=sensitivity,
444514
label_column=label_column,
445515
time_column=time_column,
516+
samplerate=samplerate,
446517
)
447518

448519
# Attach labels
@@ -452,7 +523,7 @@ def run_pipeline(run, hyperparameters, dataset,
452523
window_duration=pandas.Timedelta(window_duration, unit='s'),
453524
)
454525

455-
print(features.columns)
526+
print(features.head())
456527

457528
labeled = numpy.count_nonzero(features[label_column].notna())
458529

0 commit comments

Comments
 (0)