Skip to content

Commit e25bed6

Browse files
authored
Merge pull request #1702 from tvnisxq/smoking-prediction-project
Smoking prediction project
2 parents e8b089b + 326acc2 commit e25bed6

10 files changed

Lines changed: 2083 additions & 1 deletion
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""
2+
Components module initialization
3+
"""
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
2+
#? STAGE 1: DATA INGESTION
3+
4+
import os
5+
import pandas as pd
6+
from sklearn.model_selection import train_test_split
7+
8+
# Lift pandas' column-count display limit so printed DataFrames show all columns.
pd.set_option('display.max_columns', None)
9+
10+
class DataIngestion:
    """Load raw CSV datasets and split each one into train/test partitions.

    Only the ``"train"`` CSV of every dataset is read; the test partition is
    carved out of that file with ``train_test_split`` rather than loaded
    from disk.
    """

    def __init__(self, dataset_paths, test_size=0.2, random_state=42):
        """
        Args:
            dataset_paths: dictionary mapping a dataset name to a dictionary
                that holds at least a ``"train"`` CSV path. Other entries
                (for example ``"test"``) are ignored by ``load_data``.
                Example:
                {
                    "dataset1": {"train": "path/to/dataset1_train.csv"},
                    "dataset2": {"train": "path/to/dataset2_train.csv"},
                }
            test_size: size of the held-out partition, forwarded to
                ``train_test_split``.
            random_state: seed forwarded to ``train_test_split`` so the
                split is reproducible.
        """
        self.dataset_paths = dataset_paths
        self.test_size = test_size
        self.random_state = random_state

    def load_data(self):
        """Read every configured training CSV and split it.

        Returns:
            dict: ``{dataset_name: {"train": DataFrame, "test": DataFrame}}``.

        Raises:
            KeyError: if a dataset entry has no ``"train"`` path.
        """
        datasets = {}
        for dataset_name, paths in self.dataset_paths.items():
            full_df = pd.read_csv(paths["train"])

            # The hold-out set comes from the training file itself.
            train_data, test_data = train_test_split(
                full_df,
                test_size=self.test_size,
                random_state=self.random_state,
            )

            datasets[dataset_name] = {"train": train_data, "test": test_data}

        return datasets
40+
41+
# Raw training CSV for each dataset, keyed by dataset name.
dataset_paths = {
    "ml-olympiad-smoking": {
        "train": "Y:/SmokingML V2/data/raw/ml-olympiad-smoking/train.csv"
    },
    "archive": {
        "train": "Y:/SmokingML V2/data/raw/archive/train_dataset.csv"
    }
}

# Executed at import time: `datasets` is a module-level name that other
# code can import (the preprocessing stage appears to do so).
data_ingestion = DataIngestion(dataset_paths)
datasets = data_ingestion.load_data()

# Report the type and shape of each training split.
for _label, _name in (("ML Olympiad", "ml-olympiad-smoking"), ("Archive", "archive")):
    _train_split = datasets[_name]["train"]
    print(f"{_label} Training Data Type:", type(_train_split))
    print(f"{_label} Training Data Shape:", _train_split.shape)
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
2+
#? STAGE 2: DATA PREPROCESSING
3+
4+
#* Importing dependencies
5+
import pandas as pd
6+
import numpy as np
7+
from sklearn.impute import SimpleImputer
8+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
9+
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
10+
from sklearn.decomposition import PCA
11+
from sklearn.model_selection import train_test_split
12+
import seaborn as sns
13+
import matplotlib.pyplot as plt
14+
import os
15+
from src.components.data_ingestion import datasets
16+
17+
18+
#* Define Preprocessing Function
19+
def preprocess_data(train_df, test_df):
    """Impute, encode and scale features, then carve out a validation split.

    Every transformer is fitted on ``train_df`` only and then applied to
    ``test_df``. Note that the transformed test features are not part of the
    return value: the validation split that is returned is taken from
    ``train_df``.

    Args:
        train_df: training frame; must contain the ``smoking`` target column.
        test_df: test frame; must contain the ``smoking`` target column.

    Returns:
        tuple: ``(x_train, x_val, y_train, y_val, selected_features)`` where
        ``selected_features`` is the list of column names of ``x_train``.

    Raises:
        KeyError: if either frame lacks the ``smoking`` column.
    """
    # Store target variable
    train_target = train_df['smoking']

    # Remove target from features (drop returns new frames, inputs stay intact)
    train_features = train_df.drop('smoking', axis=1)
    test_features = test_df.drop('smoking', axis=1)

    # Numeric columns, target already excluded
    num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Mean-impute missing numeric values; skipped when there is nothing to
    # impute so the imputer is never fitted on an empty selection.
    if num_cols:
        imputer = SimpleImputer(strategy='mean')
        train_features[num_cols] = imputer.fit_transform(train_features[num_cols])
        test_features[num_cols] = imputer.transform(test_features[num_cols])

    # One-hot encode categorical columns
    cat_cols = train_features.select_dtypes(include=['object']).columns
    if len(cat_cols) > 0:
        # Categories unseen during fit are encoded as all zeros.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        train_encoded = pd.DataFrame(
            encoder.fit_transform(train_features[cat_cols]),
            index=train_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )
        test_encoded = pd.DataFrame(
            encoder.transform(test_features[cat_cols]),
            index=test_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )

        # Swap the raw categorical columns for their encoded counterparts;
        # the shared index keeps rows aligned during the concat.
        train_features = pd.concat(
            [train_features.drop(cat_cols, axis=1), train_encoded], axis=1
        )
        test_features = pd.concat(
            [test_features.drop(cat_cols, axis=1), test_encoded], axis=1
        )

    # Feature scaling - only the numeric columns, one-hot columns stay 0/1
    if num_cols:
        scaler = StandardScaler()
        train_features[num_cols] = scaler.fit_transform(train_features[num_cols])
        test_features[num_cols] = scaler.transform(test_features[num_cols])

    # Split training data into train and validation sets
    x_train, x_val, y_train, y_val = train_test_split(
        train_features, train_target,
        test_size=0.2,
        random_state=42
    )

    # Column names of the final feature matrix
    selected_features = x_train.columns.tolist()

    return x_train, x_val, y_train, y_val, selected_features
82+
83+
84+
def remove_low_variance_features(train_df, test_df, threshold=0.01):
    """Drop features whose training-set variance does not exceed ``threshold``.

    The ``smoking`` column, when present, is set aside before selection and
    re-attached afterwards so it can never be removed. The selector is fitted
    on the training features and the same column mask is applied to the test
    features.

    Returns:
        tuple: ``(train_selected, test_selected)`` DataFrames that keep the
        original row indices.
    """
    def _split_target(frame):
        # Separate the target (if any) from the feature columns.
        if 'smoking' in frame.columns:
            return frame.drop('smoking', axis=1), frame['smoking']
        return frame, None

    train_features, train_target = _split_target(train_df)
    test_features, test_target = _split_target(test_df)

    selector = VarianceThreshold(threshold)
    train_reduced = selector.fit_transform(train_features)
    test_reduced = selector.transform(test_features)

    # Names of the columns that survived the variance filter.
    kept_columns = train_features.columns[selector.get_support()]

    train_selected = pd.DataFrame(train_reduced, columns=kept_columns, index=train_df.index)
    test_selected = pd.DataFrame(test_reduced, columns=kept_columns, index=test_df.index)

    if train_target is not None:
        train_selected['smoking'] = train_target
    if test_target is not None:
        test_selected['smoking'] = test_target

    return train_selected, test_selected
106+
107+
108+
def remove_highly_correlated_features(train_df, test_df, threshold=0.9):
    """Drop one column of every highly correlated column pair.

    Correlations are computed on ``train_df`` only. A column is dropped when
    the absolute value of its correlation with any column to its left exceeds
    ``threshold``, so strongly negative relationships count as redundant as
    well. Every column of ``train_df`` takes part in the comparison, so pass
    features only if a target column must be protected.

    Args:
        train_df: frame used to compute the correlation matrix.
        test_df: frame that receives the same column removal.
        threshold: absolute correlation above which a column is dropped.

    Returns:
        tuple: ``(train_df, test_df)`` without the dropped columns.

    Raises:
        KeyError: if ``test_df`` lacks one of the columns to drop.
    """
    correlation_matrix = train_df.corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation of 1.0) is ignored.
    upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper_triangle = correlation_matrix.where(upper_mask)

    # NaN entries (masked cells, constant columns) compare as False.
    drop_cols = [
        column
        for column in upper_triangle.columns
        if (upper_triangle[column] > threshold).any()
    ]
    return train_df.drop(columns=drop_cols), test_df.drop(columns=drop_cols)
113+
114+
115+
def select_features_by_mutual_info(train_df, test_df, target_column, num_features=15):
    """Keep the ``num_features`` features with the highest mutual information.

    Scores are estimated with ``mutual_info_classif`` on ``train_df`` against
    ``target_column``. The training frame always keeps the target; the test
    frame keeps it only when it has that column.

    Returns:
        tuple: ``(train_subset, test_subset)`` restricted to the selected
        columns.
    """
    feature_frame = train_df.drop(columns=[target_column])
    labels = train_df[target_column]

    scores = pd.Series(
        mutual_info_classif(feature_frame, labels, discrete_features='auto'),
        index=feature_frame.columns,
    )
    selected_features = scores.nlargest(num_features).index.to_list()

    train_columns = selected_features + [target_column]
    # The test frame may arrive without labels.
    test_columns = train_columns if target_column in test_df.columns else selected_features

    return train_df[train_columns], test_df[test_columns]
127+
128+
129+
def apply_pca(train_df, test_df, n_components=10):
    """Project both frames onto principal components fitted on ``train_df``.

    Returns:
        tuple: ``(train_pca, test_pca)`` as new DataFrames built from the
        projected arrays; the input indices and column names are not
        carried over.
    """
    reducer = PCA(n_components=n_components)
    projected_train = pd.DataFrame(reducer.fit_transform(train_df))
    projected_test = pd.DataFrame(reducer.transform(test_df))
    return projected_train, projected_test
134+
135+
136+
if __name__ == "__main__":
    #* Load both Train and Test Datasets
    ml_splits = datasets["ml-olympiad-smoking"]
    train_ml = pd.DataFrame(ml_splits["train"])
    test_ml = pd.DataFrame(ml_splits["test"])
    archive_splits = datasets["archive"]
    train_archive = pd.DataFrame(archive_splits["train"])
    test_archive = pd.DataFrame(archive_splits["test"])

    print("DISPLAY BASIC INFORMATION")
    # (label, train frame, test frame, frame whose head is previewed)
    overview = (
        ("ML Olympiad", train_ml, test_ml, train_ml),
        ("Archive", train_archive, test_archive, test_archive),
    )
    for label, train_frame, test_frame, preview in overview:
        print(f"{label} Train Data Shape:", train_frame.shape)
        print(f"{label} Test Data Shape:", test_frame.shape)
        print(preview.head())

    #* Apply Preprocessing to all datasets
    x_train_ml, x_val_ml, y_train_ml, y_val_ml, selected_features_ml = preprocess_data(train_ml, test_ml)
    x_train_archive, x_val_archive, y_train_archive, y_val_archive, selected_features_archive = preprocess_data(train_archive, test_archive)

    preprocessed_data_paths = {
        "ml-olympiad-smoking": {
            "train": "Y:/SmokingML V2/data/processed/ml_olympiad_train.csv",
            "test": "Y:/SmokingML V2/data/processed/ml_olympiad_test.csv"
        },
        "archive": {
            "train": "Y:/SmokingML V2/data/processed/archive_train.csv",
            "test": "Y:/SmokingML V2/data/processed/archive_test.csv"
        }
    }

    # Make sure every output directory exists before writing.
    for paths in preprocessed_data_paths.values():
        for path in paths.values():
            os.makedirs(os.path.dirname(path), exist_ok=True)

    # Features and target side by side; the "test" files hold the validation split.
    ml_train_frame = pd.concat([x_train_ml, y_train_ml], axis=1)
    ml_val_frame = pd.concat([x_val_ml, y_val_ml], axis=1)
    archive_train_frame = pd.concat([x_train_archive, y_train_archive], axis=1)
    archive_val_frame = pd.concat([x_val_archive, y_val_archive], axis=1)

    split_frames = {
        "ml-olympiad-smoking": {"train": ml_train_frame, "test": ml_val_frame},
        "archive": {"train": archive_train_frame, "test": archive_val_frame},
    }
    for dataset_name, frames in split_frames.items():
        for split_name, frame in frames.items():
            frame.to_csv(preprocessed_data_paths[dataset_name][split_name], index=False)

    print("Preprocessed data has been saved successfully!")

    #* Variance Thresholding
    preprocessed_train_ml, preprocessed_test_ml = remove_low_variance_features(ml_train_frame, ml_val_frame)
    preprocessed_train_archive, preprocessed_test_archive = remove_low_variance_features(archive_train_frame, archive_val_frame)

    #* Feature Selection
    preprocessed_train_ml, preprocessed_test_ml = select_features_by_mutual_info(
        preprocessed_train_ml, preprocessed_test_ml, target_column='smoking'
    )
    preprocessed_train_archive, preprocessed_test_archive = select_features_by_mutual_info(
        preprocessed_train_archive, preprocessed_test_archive, target_column='smoking'
    )

    #* ✅ Optional assertion checks
    target_checks = (
        (preprocessed_train_ml, "Target column 'smoking' missing in training set!"),
        (preprocessed_test_ml, "Target column 'smoking' missing in test set!"),
        (preprocessed_train_archive, "Target column 'smoking' missing in archive training set!"),
        (preprocessed_test_archive, "Target column 'smoking' missing in archive test set!"),
    )
    for frame, message in target_checks:
        assert 'smoking' in frame.columns, message

    # (console label, frame to save, destination path)
    final_outputs = (
        ("ML Train Path :", preprocessed_train_ml, "Y:/SmokingML V2/data/processed/train_ml.csv"),
        ("ML Test Path :", preprocessed_test_ml, "Y:/SmokingML V2/data/processed/test_ml.csv"),
        ("Archive Train Path :", preprocessed_train_archive, "Y:/SmokingML V2/data/processed/train_archive.csv"),
        ("Archive Test Path :", preprocessed_test_archive, "Y:/SmokingML V2/data/processed/test_archive.csv"),
    )

    #* ✅ Debug: Show absolute save paths
    print("\n✅ Saving preprocessed files to:")
    for label, _, path in final_outputs:
        print(label, os.path.abspath(path))

    #* Save final preprocessed files
    for _, frame, path in final_outputs:
        frame.to_csv(path, index=False)

    print("Feature Engineering and Selection completed Successfully!")

    # json is only needed for the export step below.
    import json

    #* Save selected features to JSON for both datasets
    selected_features_dir = "Y:/SmokingML V2/artifacts/models"
    os.makedirs(selected_features_dir, exist_ok=True)

    # The target is not a model input, so it is left out of the saved lists.
    selected_columns_olympiad = [name for name in preprocessed_train_ml.columns if name != 'smoking']
    selected_columns_archive = [name for name in preprocessed_train_archive.columns if name != 'smoking']

    # Save to JSON
    json_exports = (
        ("feature_columns_olympiad.json", selected_columns_olympiad),
        ("feature_columns_archive.json", selected_columns_archive),
    )
    for file_name, columns in json_exports:
        with open(os.path.join(selected_features_dir, file_name), "w") as f:
            json.dump(columns, f, indent=4)

    print("✅ Feature columns JSON files saved successfully!")
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
2+
#? STAGE 3: FEATURE ENGINEERING
3+
4+
import json
5+
import numpy as np
6+
import pandas as pd
7+
from typing import Dict, List
8+
from sklearn.preprocessing import PolynomialFeatures
9+
from pathlib import Path
10+
11+
class FeatureEngineer:
    """Derive extra features from a DataFrame according to a rule set.

    Rules are read from a JSON config file when it exists; otherwise the
    built-in defaults from ``_get_default_rules`` are used. A rule set holds
    formula-based "health indicators", a list of columns to expand into
    polynomial terms, and formula-based "feature ratios".
    """

    # Default location of the optional rules file (relative to the working
    # directory). Instances may override it through ``__init__``.
    config_path = Path("config/feature_engineering_rules.json")

    def __init__(self, config_path=None):
        """
        Args:
            config_path: optional path of the JSON rules file. When omitted,
                ``config/feature_engineering_rules.json`` is used.
        """
        if config_path is not None:
            self.config_path = Path(config_path)
        self.rules = self._load_rules()

    def _load_rules(self) -> Dict:
        """Load feature engineering rules from config file"""
        config_path = self.config_path
        if not config_path.exists():
            return self._get_default_rules()

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _get_default_rules(self) -> Dict:
        """Default feature engineering rules if no config exists"""
        return {
            "health_indicators": [
                {
                    "name": "bmi_health_index",
                    "formula": "weight / (height ** 2)",
                    "enabled": True,
                    "description": "BMI-based health indicator"
                }
            ],
            "polynomial_features": ["age", "weight", "height"],
            "feature_ratios": [
                {
                    "name": "age_bmi_ratio",
                    "formula": "age / bmi_health_index",
                    "enabled": True
                }
            ],
            "polynomial_degree": 2
        }

    def _apply_formula_rules(self, df: pd.DataFrame, rules: List[Dict]) -> pd.DataFrame:
        """Evaluate each enabled formula rule and store it as a new column.

        Formulas may reference any column of the frame by name (including
        columns added by earlier rules in the same call) as well as ``np``.
        A rule whose formula fails is reported on stdout and skipped, so one
        bad rule does not abort the others.
        """
        result = df.copy()

        for rule in rules:
            if not rule["enabled"]:
                continue
            try:
                # NOTE(security): formulas are executed with eval(). Clearing
                # __builtins__ is not a real sandbox, so the rules file must
                # come from a trusted source only.
                result[rule["name"]] = eval(
                    rule["formula"],
                    {"__builtins__": None},
                    {**dict(result), "np": np},
                )
            except Exception as e:
                print(f"Failed to calculate {rule['name']}: {str(e)}")

        return result

    def create_health_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate health indicator features based on configured rules"""
        return self._apply_formula_rules(df, self.rules["health_indicators"])

    def create_polynomial_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate polynomial features for specified columns

        Only configured columns that exist in ``df`` are expanded; the new
        columns are named ``poly_<term>``. The input frame is not modified.
        """
        result = df.copy()
        features_to_transform = [col for col in self.rules["polynomial_features"]
                                 if col in df.columns]

        if not features_to_transform:
            return result

        poly = PolynomialFeatures(
            degree=self.rules["polynomial_degree"],
            include_bias=False
        )

        poly_features = poly.fit_transform(df[features_to_transform])
        feature_names = poly.get_feature_names_out(features_to_transform)

        # Skip the leading outputs (one per input column) and add only the
        # interaction terms and higher degree terms.
        n_inputs = len(features_to_transform)
        for i, name in enumerate(feature_names[n_inputs:], start=n_inputs):
            result[f"poly_{name}"] = poly_features[:, i]

        return result

    def create_feature_ratios(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate feature ratios based on configured rules"""
        return self._apply_formula_rules(df, self.rules["feature_ratios"])

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply all feature engineering transformations

        Order matters: ratios run last so their formulas can reference the
        health indicator columns created in the first step.
        """
        result = df.copy()
        result = self.create_health_indicators(result)
        result = self.create_polynomial_features(result)
        result = self.create_feature_ratios(result)
        return result

0 commit comments

Comments
 (0)