# run_preprocessing.py
"""
Data Preprocessing Pipeline -
* Checks/Downloads raw data from Google Drive.
* Reads, normalizes, and windows the data.
* Generates subject-dependent splits.
* Saves processed artifacts to the location defined in config.py.
"""
import os
import numpy as np
from config import DATA_DIR, TIMESTEPS, OVERLAP_SAMPLES
from data_pipeline.downloader import download_and_extract_data
from data_pipeline.processor import load_raw_csvs, preprocess_features, generate_splits
# Root directory the raw archive is downloaded/extracted into.
# Currently aliased to DATA_DIR from config.py, so raw and processed data
# share one location; repoint RAW_DATA_ROOT here if you want them separated.
# (The downloader is assumed to extract the zip into a folder structure
# under this root — load_raw_csvs searches it recursively.)
RAW_DATA_ROOT = DATA_DIR
def main():
    """Run the preprocessing pipeline end to end.

    Steps: ensure raw data is present, load and normalize the CSVs,
    window the series into subject-dependent train/test splits, and
    save the four artifacts as text files under DATA_DIR.
    """
    print("Starting Data Preprocessing Pipeline...")

    print("\n[Step 1] Checking Raw Data Source...")
    download_and_extract_data(RAW_DATA_ROOT)

    print("\n[Step 2] processing CSV files...")
    # load_raw_csvs searches recursively, so pointing it at the download
    # root is enough regardless of the archive's internal layout.
    combined = load_raw_csvs(RAW_DATA_ROOT)

    print("\n[Step 3] Normalizing and Encoding...")
    features = preprocess_features(combined)

    print(f"\n[Step 4] Generating Windows (Size: {TIMESTEPS}) and Splits...")
    # Subject-dependent split, mirroring the original notebook logic.
    splits = generate_splits(
        features,
        train_ratio=0.8,
        window_size=TIMESTEPS,
        step_size=OVERLAP_SAMPLES,
    )
    X_train, y_train, X_test, y_test = splits
    print(f" -> Train shape: {X_train.shape}")
    print(f" -> Test shape: {X_test.shape}")

    # DATA_DIR comes from config.py so the training scripts find the output.
    print(f"\n[Step 5] Saving datasets to {DATA_DIR}...")
    os.makedirs(DATA_DIR, exist_ok=True)
    # Text artifacts: features with 6 decimal places, labels as integers.
    # NOTE(review): np.savetxt only accepts 1-D/2-D arrays — this assumes
    # generate_splits returns 2-D feature matrices; confirm upstream.
    artifacts = (
        ("X_train.txt", X_train, "%.6f"),
        ("y_train.txt", y_train, "%d"),
        ("X_test.txt", X_test, "%.6f"),
        ("y_test.txt", y_test, "%d"),
    )
    for filename, array, number_format in artifacts:
        np.savetxt(os.path.join(DATA_DIR, filename), array, fmt=number_format)

    print("\n=== Preprocessing Complete! ===")
    print(f"You can now run 'python run_kd_pipeline.py'")
# Script entry point: run the full pipeline when executed directly,
# but do nothing on import (so the helpers stay importable).
if __name__ == "__main__":
    main()