# run_preprocessing.py
"""
Data Preprocessing Pipeline -
* Checks/Downloads raw data from Google Drive.
* Reads, normalizes, and windows the data.
* Generates subject-dependent splits.
* Saves processed artifacts to the location defined in config.py.
"""
import os
import numpy as np
from config import DATA_DIR, TIMESTEPS, OVERLAP_SAMPLES
from data_pipeline.downloader import download_and_extract_data
from data_pipeline.processor import load_raw_csvs, preprocess_features, generate_splits
# Root directory the raw archive is downloaded/extracted into.
# Currently aliased to DATA_DIR from config.py, so raw and processed data
# share one location; repoint RAW_DATA_ROOT here if you want them separated.
# (The downloader is assumed to extract the zip into a folder structure
# under this root — load_raw_csvs searches it recursively.)
RAW_DATA_ROOT = DATA_DIR
def main():
    """Run the preprocessing pipeline end to end.

    Steps: ensure raw data is present, load and normalize the CSVs,
    window the series into subject-dependent train/test splits, and
    save the four artifacts as text files under DATA_DIR.
    """
    print("Starting Data Preprocessing Pipeline...")

    print("\n[Step 1] Checking Raw Data Source...")
    download_and_extract_data(RAW_DATA_ROOT)

    print("\n[Step 2] processing CSV files...")
    # load_raw_csvs searches recursively, so pointing it at the download
    # root is enough regardless of the archive's internal layout.
    combined = load_raw_csvs(RAW_DATA_ROOT)

    print("\n[Step 3] Normalizing and Encoding...")
    features = preprocess_features(combined)

    print(f"\n[Step 4] Generating Windows (Size: {TIMESTEPS}) and Splits...")
    # Subject-dependent split, mirroring the original notebook logic.
    splits = generate_splits(
        features,
        train_ratio=0.8,
        window_size=TIMESTEPS,
        step_size=OVERLAP_SAMPLES,
    )
    X_train, y_train, X_test, y_test = splits
    print(f" -> Train shape: {X_train.shape}")
    print(f" -> Test shape: {X_test.shape}")

    # DATA_DIR comes from config.py so the training scripts find the output.
    print(f"\n[Step 5] Saving datasets to {DATA_DIR}...")
    os.makedirs(DATA_DIR, exist_ok=True)
    # Text artifacts: features with 6 decimal places, labels as integers.
    # NOTE(review): np.savetxt only accepts 1-D/2-D arrays — this assumes
    # generate_splits returns 2-D feature matrices; confirm upstream.
    artifacts = (
        ("X_train.txt", X_train, "%.6f"),
        ("y_train.txt", y_train, "%d"),
        ("X_test.txt", X_test, "%.6f"),
        ("y_test.txt", y_test, "%d"),
    )
    for filename, array, number_format in artifacts:
        np.savetxt(os.path.join(DATA_DIR, filename), array, fmt=number_format)

    print("\n=== Preprocessing Complete! ===")
    print(f"You can now run 'python run_kd_pipeline.py'")
# Script entry point: run the full pipeline when executed directly,
# but do nothing on import (so the helpers stay importable).
if __name__ == "__main__":
    main()