recodehive · sanjay-kv · Jan 2, 2026 · Dec 31, 2025 · Dec 31, 2025
diff --git a/projects/prediction/Smoking Prediction/__init__.py b/projects/prediction/Smoking Prediction/__init__.py
@@ -0,0 +1,3 @@
+"""
+Components module initialization
+"""
diff --git a/projects/prediction/Smoking Prediction/data_ingestion.py b/projects/prediction/Smoking Prediction/data_ingestion.py
@@ -0,0 +1,58 @@
+
+#? STAGE 1: DATA INGESTION
+
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Lift pandas' limit on displayed columns so printed DataFrames are not column-truncated
+pd.set_option('display.max_columns', None)
+
+class DataIngestion:
+    """Read raw CSV datasets and split each one into train and test partitions."""
+
+    def __init__(self, dataset_paths, test_size=0.2, random_state=42):
+        """
+        dataset_paths: dictionary mapping dataset names to a dictionary of file paths.
+        Only the "train" path is read; the test partition is split off from that file.
+        Example:
+        {
+            "dataset1": {"train": "path/to/dataset1_train.csv"},
+            "dataset2": {"train": "path/to/dataset2_train.csv"}
+        }
+        test_size: share of rows held out as the test partition (passed to train_test_split)
+        random_state: seed passed to train_test_split so the split is repeatable
+        """
+        self.dataset_paths = dataset_paths
+        self.test_size = test_size
+        self.random_state = random_state
+
+    def load_data(self):
+        """
+        Load every configured dataset and split it.
+
+        Returns: dictionary of the form {dataset_name: {"train": DataFrame, "test": DataFrame}}
+        Raises: KeyError if a dataset entry has no "train" path
+        """
+        datasets = {}
+        for dataset_name, paths in self.dataset_paths.items():
+            # Read the full training file for this dataset
+            train_df = pd.read_csv(paths["train"])
+
+            # Hold out part of it as the test partition
+            train_data, test_data = train_test_split(
+                train_df,
+                test_size=self.test_size,
+                random_state=self.random_state,
+            )
+
+            datasets[dataset_name] = {
+                "train": train_data,
+                "test": test_data,
+            }
+
+        return datasets
+
+# Raw CSV locations keyed by dataset name; each entry supplies only a "train" file
+dataset_paths = {
+    "ml-olympiad-smoking": {"train": "Y:/SmokingML V2/data/raw/ml-olympiad-smoking/train.csv"},
+    "archive": {"train": "Y:/SmokingML V2/data/raw/archive/train_dataset.csv"},
+}
+
+# Loading happens at import time; the preprocessing stage imports `datasets`
+# from a data_ingestion module (presumably this one - verify the package path)
+data_ingestion = DataIngestion(dataset_paths)
+datasets = data_ingestion.load_data()
+
+# Report the type and shape of each dataset's training split
+_ml_train = datasets["ml-olympiad-smoking"]["train"]
+print("ML Olympiad Training Data Type:", type(_ml_train))
+print("ML Olympiad Training Data Shape:", _ml_train.shape)
+_archive_train = datasets["archive"]["train"]
+print("Archive Training Data Type:", type(_archive_train))
+print("Archive Training Data Shape:", _archive_train.shape)
diff --git a/projects/prediction/Smoking Prediction/data_preprocessing.py b/projects/prediction/Smoking Prediction/data_preprocessing.py
@@ -0,0 +1,224 @@
+
+#? STAGE 2: DATA PREPROCESSING
+
+#* Importing dependencies
+import pandas as pd 
+import numpy as np
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
+from sklearn.decomposition import PCA
+from sklearn.model_selection import train_test_split
+import seaborn as sns
+import matplotlib.pyplot as plt
+import os
+from src.components.data_ingestion import datasets
+
+
+#* Define Preprocessing Function
+def preprocess_data(train_df, test_df):
+    """
+    Impute, one-hot encode and scale the features, then split the training data.
+
+    Every transformer is fitted on train_df and then applied to test_df. The
+    transformed test features are not part of the return value.
+
+    train_df: DataFrame holding the feature columns and the 'smoking' target
+    test_df: DataFrame with the same layout as train_df (including 'smoking')
+
+    Returns: (x_train, x_val, y_train, y_val, selected_features) where the
+    train/validation parts come from an 80/20 split (random_state=42) of the
+    transformed train_df and selected_features lists the columns of x_train.
+
+    Raises: KeyError if 'smoking' is missing from either frame.
+    """
+    # Separate the target; drop() raises KeyError when 'smoking' is absent
+    train_target = train_df['smoking']
+    train_features = train_df.drop('smoking', axis=1)
+    test_features = test_df.drop('smoking', axis=1)
+
+    # Numeric columns (int64 / float64 only), target already excluded
+    num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
+
+    # Fill missing numeric values with the training mean. Skipped when there are
+    # no numeric columns, because the estimators reject a zero-column input.
+    if num_cols:
+        imputer = SimpleImputer(strategy='mean')
+        train_features[num_cols] = imputer.fit_transform(train_features[num_cols])
+        test_features[num_cols] = imputer.transform(test_features[num_cols])
+
+    # One-hot encode object-typed columns; categories unseen in training are ignored
+    cat_cols = train_features.select_dtypes(include=['object']).columns
+    if len(cat_cols) > 0:
+        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+        train_encoded = pd.DataFrame(
+            encoder.fit_transform(train_features[cat_cols]),
+            index=train_features.index,
+            columns=encoder.get_feature_names_out(cat_cols)
+        )
+        test_encoded = pd.DataFrame(
+            encoder.transform(test_features[cat_cols]),
+            index=test_features.index,
+            columns=encoder.get_feature_names_out(cat_cols)
+        )
+
+        # Swap the original categorical columns for their encoded versions;
+        # the encoded frames reuse the original index, so rows stay aligned
+        train_features = pd.concat([train_features.drop(cat_cols, axis=1), train_encoded], axis=1)
+        test_features = pd.concat([test_features.drop(cat_cols, axis=1), test_encoded], axis=1)
+
+    # Standardise the numeric columns only (encoded columns are left as 0/1)
+    if num_cols:
+        scaler = StandardScaler()
+        train_features[num_cols] = scaler.fit_transform(train_features[num_cols])
+        test_features[num_cols] = scaler.transform(test_features[num_cols])
+
+    # Split the transformed training data into train and validation sets
+    x_train, x_val, y_train, y_val = train_test_split(
+        train_features, train_target,
+        test_size=0.2,
+        random_state=42
+    )
+
+    # Column names of the final feature matrix
+    selected_features = x_train.columns.tolist()
+
+    return x_train, x_val, y_train, y_val, selected_features
+
+
+def remove_low_variance_features(train_df, test_df, threshold=0.01, target_column='smoking'):
+    """
+    Drop features whose variance on train_df does not exceed threshold.
+
+    The selector is fitted on the training features and the same columns are
+    then kept in test_df. When target_column is present in a frame it is set
+    aside before selection and appended again afterwards.
+
+    train_df, test_df: DataFrames with identical feature columns
+    threshold: variance cut-off handed to VarianceThreshold
+    target_column: name of the label column to exclude from selection
+
+    Returns: (train_selected, test_selected) DataFrames keeping the input indexes
+    """
+    def _split_target(df):
+        # Return (features, target); target is None when the column is absent
+        if target_column in df.columns:
+            return df.drop(target_column, axis=1), df[target_column]
+        return df, None
+
+    train_features, train_target = _split_target(train_df)
+    test_features, test_target = _split_target(test_df)
+
+    selector = VarianceThreshold(threshold)
+    train_features_var = selector.fit_transform(train_features)
+    test_features_var = selector.transform(test_features)
+
+    # Names of the columns that survived the variance filter
+    selected_columns = train_features.columns[selector.get_support()]
+
+    train_selected = pd.DataFrame(train_features_var, columns=selected_columns, index=train_df.index)
+    test_selected = pd.DataFrame(test_features_var, columns=selected_columns, index=test_df.index)
+
+    if train_target is not None:
+        train_selected[target_column] = train_target
+    if test_target is not None:
+        test_selected[target_column] = test_target
+
+    return train_selected, test_selected
+
+
+def remove_highly_correlated_features(train_df, test_df, threshold=0.9):
+    """
+    Drop one column out of every pair whose absolute correlation exceeds threshold.
+
+    Correlations are computed on train_df only. For each correlated pair the
+    column that appears later in train_df is removed, and the same columns are
+    removed from test_df. Every column of train_df takes part in the
+    comparison, so pass features only if a target column must be protected.
+
+    train_df, test_df: DataFrames sharing the same columns
+    threshold: absolute correlation above which a column counts as redundant
+
+    Returns: (train_df, test_df) copies without the redundant columns
+    """
+    # Use |r| so strong negative correlations are treated as redundant as well
+    correlation_matrix = train_df.corr().abs()
+
+    # Keep the strict upper triangle so each pair is looked at exactly once
+    upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
+    upper_triangle = correlation_matrix.where(upper_mask)
+
+    drop_cols = [
+        column for column in upper_triangle.columns
+        if (upper_triangle[column] > threshold).any()
+    ]
+    return train_df.drop(columns=drop_cols), test_df.drop(columns=drop_cols)
+
+
+def select_features_by_mutual_info(train_df, test_df, target_column, num_features=15, random_state=42):
+    """
+    Keep the num_features columns with the highest mutual information to the target.
+
+    Scores are computed on train_df only; the same columns are then selected
+    from test_df.
+
+    train_df: DataFrame containing the features and target_column
+    test_df: DataFrame containing the same features; target_column is optional
+    target_column: name of the label column
+    num_features: number of top-scoring features to keep
+    random_state: seed for mutual_info_classif so repeated runs pick the same features
+
+    Returns: (train_df, test_df) restricted to the selected features, with
+    target_column appended wherever the input frame had it
+    """
+    X = train_df.drop(columns=[target_column])
+    y = train_df[target_column]
+
+    mutual_info = mutual_info_classif(X, y, discrete_features='auto', random_state=random_state)
+    feature_scores = pd.Series(mutual_info, index=X.columns)
+    selected_features = feature_scores.nlargest(num_features).index.to_list()
+
+    train_columns = selected_features + [target_column]
+    # The test frame keeps the target only if it actually has one
+    test_columns = train_columns if target_column in test_df.columns else selected_features
+
+    return train_df[train_columns], test_df[test_columns]
+
+
+def apply_pca(train_df, test_df, n_components=10):
+    """
+    Project both frames onto principal components fitted on train_df.
+
+    train_df, test_df: numeric feature tables with the same columns
+    n_components: value passed to PCA(n_components=...)
+
+    Returns: (train_pca, test_pca) DataFrames with integer column labels. When
+    the inputs are DataFrames their row index is carried over, so the result
+    can be concatenated with a target Series without misaligning rows.
+    """
+    pca = PCA(n_components=n_components)
+    train_pca = pca.fit_transform(train_df)
+    test_pca = pca.transform(test_df)
+
+    # Plain arrays have no index; fall back to the default RangeIndex for them
+    train_index = getattr(train_df, "index", None)
+    test_index = getattr(test_df, "index", None)
+    return pd.DataFrame(train_pca, index=train_index), pd.DataFrame(test_pca, index=test_index)
+
+
+if __name__ == "__main__":
+    #* Fetch the train/test splits produced by the ingestion stage
+    ml_splits = datasets["ml-olympiad-smoking"]
+    train_ml = pd.DataFrame(ml_splits["train"])
+    test_ml = pd.DataFrame(ml_splits["test"])
+    archive_splits = datasets["archive"]
+    train_archive = pd.DataFrame(archive_splits["train"])
+    test_archive = pd.DataFrame(archive_splits["test"])
+
+    print("DISPLAY BASIC INFORMATION")
+    print("ML Olympiad Train Data Shape:", train_ml.shape)
+    print("ML Olympiad Test Data Shape:", test_ml.shape)
+    print(train_ml.head())
+    print("Archive Train Data Shape:", train_archive.shape)
+    print("Archive Test Data Shape:", test_archive.shape)
+    print(test_archive.head())
+
+    #* Preprocess each dataset; the result is a train/validation split of the train data
+    x_train_ml, x_val_ml, y_train_ml, y_val_ml, features_ml = preprocess_data(train_ml, test_ml)
+    x_train_archive, x_val_archive, y_train_archive, y_val_archive, features_archive = preprocess_data(
+        train_archive, test_archive
+    )
+
+    preprocessed_data_paths = {
+        "ml-olympiad-smoking": {
+            "train": "Y:/SmokingML V2/data/processed/ml_olympiad_train.csv",
+            "test": "Y:/SmokingML V2/data/processed/ml_olympiad_test.csv",
+        },
+        "archive": {
+            "train": "Y:/SmokingML V2/data/processed/archive_train.csv",
+            "test": "Y:/SmokingML V2/data/processed/archive_test.csv",
+        },
+    }
+
+    # Make sure every output directory exists before anything is written
+    for paths in preprocessed_data_paths.values():
+        for path in paths.values():
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    # Re-attach the targets and persist; the "test" files receive the validation parts
+    ml_out = preprocessed_data_paths["ml-olympiad-smoking"]
+    ml_train_full = pd.concat([x_train_ml, y_train_ml], axis=1)
+    ml_train_full.to_csv(ml_out["train"], index=False)
+    ml_val_full = pd.concat([x_val_ml, y_val_ml], axis=1)
+    ml_val_full.to_csv(ml_out["test"], index=False)
+
+    archive_out = preprocessed_data_paths["archive"]
+    archive_train_full = pd.concat([x_train_archive, y_train_archive], axis=1)
+    archive_train_full.to_csv(archive_out["train"], index=False)
+    archive_val_full = pd.concat([x_val_archive, y_val_archive], axis=1)
+    archive_val_full.to_csv(archive_out["test"], index=False)
+
+    print("Preprocessed data has been saved successfully!")
+
+    #* Variance Thresholding
+    preprocessed_train_ml, preprocessed_test_ml = remove_low_variance_features(ml_train_full, ml_val_full)
+    preprocessed_train_archive, preprocessed_test_archive = remove_low_variance_features(
+        archive_train_full, archive_val_full
+    )
+
+    #* Feature Selection
+    preprocessed_train_ml, preprocessed_test_ml = select_features_by_mutual_info(
+        preprocessed_train_ml, preprocessed_test_ml, target_column='smoking'
+    )
+    preprocessed_train_archive, preprocessed_test_archive = select_features_by_mutual_info(
+        preprocessed_train_archive, preprocessed_test_archive, target_column='smoking'
+    )
+
+    #* Sanity checks: the target must still be present in every output frame
+    assert 'smoking' in preprocessed_train_ml.columns, "Target column 'smoking' missing in training set!"
+    assert 'smoking' in preprocessed_test_ml.columns, "Target column 'smoking' missing in test set!"
+    assert 'smoking' in preprocessed_train_archive.columns, "Target column 'smoking' missing in archive training set!"
+    assert 'smoking' in preprocessed_test_archive.columns, "Target column 'smoking' missing in archive test set!"
+
+    #* Final outputs as (console label, destination, frame) triples
+    final_outputs = [
+        ("ML Train Path      :", "Y:/SmokingML V2/data/processed/train_ml.csv", preprocessed_train_ml),
+        ("ML Test Path       :", "Y:/SmokingML V2/data/processed/test_ml.csv", preprocessed_test_ml),
+        ("Archive Train Path :", "Y:/SmokingML V2/data/processed/train_archive.csv", preprocessed_train_archive),
+        ("Archive Test Path  :", "Y:/SmokingML V2/data/processed/test_archive.csv", preprocessed_test_archive),
+    ]
+
+    #* Debug: show the absolute save paths first, then write the files
+    print("\n✅ Saving preprocessed files to:")
+    for label, destination, _ in final_outputs:
+        print(label, os.path.abspath(destination))
+
+    for _, destination, frame in final_outputs:
+        frame.to_csv(destination, index=False)
+
+    print("Feature Engineering and Selection completed Successfully!")
+
+
+    import json
+
+    #* Save the selected feature names to JSON for both datasets
+    selected_features_dir = "Y:/SmokingML V2/artifacts/models"
+    os.makedirs(selected_features_dir, exist_ok=True)
+
+    # The target column is left out of the saved feature lists
+    feature_lists = {
+        "feature_columns_olympiad.json": [col for col in preprocessed_train_ml.columns if col != 'smoking'],
+        "feature_columns_archive.json": [col for col in preprocessed_train_archive.columns if col != 'smoking'],
+    }
+
+    for filename, columns in feature_lists.items():
+        with open(os.path.join(selected_features_dir, filename), "w") as f:
+            json.dump(columns, f, indent=4)
+
+    print("✅ Feature columns JSON files saved successfully!")
diff --git a/projects/prediction/Smoking Prediction/feature_engineering.py b/projects/prediction/Smoking Prediction/feature_engineering.py
@@ -0,0 +1,106 @@
+
+#? STAGE 3: FEATURE ENGINEERING
+
+import json
+import numpy as np
+import pandas as pd
+from typing import Dict, List
+from sklearn.preprocessing import PolynomialFeatures
+from pathlib import Path
+
+class FeatureEngineer:
+    """Add derived features to a DataFrame according to a set of rules.
+
+    Rules come from config/feature_engineering_rules.json (resolved against the
+    current working directory) when that file exists, otherwise from
+    _get_default_rules().
+    """
+
+    def __init__(self):
+        self.rules = self._load_rules()
+
+    def _load_rules(self) -> Dict:
+        """Load feature engineering rules from config file
+
+        Top-level keys missing from the config file fall back to the default
+        rules, so a partial config does not raise KeyError when used later.
+        """
+        defaults = self._get_default_rules()
+        config_path = Path("config/feature_engineering_rules.json")
+        if not config_path.exists():
+            return defaults
+
+        with open(config_path, "r", encoding="utf-8") as f:
+            loaded = json.load(f)
+
+        # Values from the file win over the defaults
+        return {**defaults, **loaded}
+
+    def _get_default_rules(self) -> Dict:
+        """Default feature engineering rules if no config exists"""
+        return {
+            "health_indicators": [
+                {
+                    "name": "bmi_health_index",
+                    "formula": "weight / (height ** 2)",
+                    "enabled": True,
+                    "description": "BMI-based health indicator"
+                }
+            ],
+            "polynomial_features": ["age", "weight", "height"],
+            "feature_ratios": [
+                {
+                    "name": "age_bmi_ratio",
+                    "formula": "age / bmi_health_index",
+                    "enabled": True
+                }
+            ],
+            "polynomial_degree": 2
+        }
+
+    def _apply_formula_rules(self, df: pd.DataFrame, rules: List[Dict]) -> pd.DataFrame:
+        """Evaluate each enabled rule's formula and store it under the rule's name.
+
+        Formulas see the frame's columns by name plus `np`. A formula that
+        fails is reported with print() and skipped; the other rules still run.
+        """
+        result = df.copy()
+
+        for rule in rules:
+            if not rule["enabled"]:
+                continue
+            try:
+                # NOTE(security): formulas are eval()'d. Removing __builtins__
+                # is not a sandbox - only load rule files from trusted sources.
+                result[rule["name"]] = eval(rule["formula"],
+                                          {"__builtins__": None},
+                                          {**dict(result), "np": np})
+            except Exception as e:
+                # Deliberate best-effort: one bad rule must not abort the rest
+                print(f"Failed to calculate {rule['name']}: {str(e)}")
+
+        return result
+
+    def create_health_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate health indicator features based on configured rules"""
+        return self._apply_formula_rules(df, self.rules["health_indicators"])
+
+    def create_polynomial_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate polynomial features for specified columns
+
+        Only configured columns that exist in df are used. The new columns are
+        named "poly_<term>"; the input frame is returned as a copy, unchanged,
+        when none of the configured columns are present.
+        """
+        result = df.copy()
+        features_to_transform = [col for col in self.rules["polynomial_features"]
+                               if col in df.columns]
+
+        if not features_to_transform:
+            return result
+
+        poly = PolynomialFeatures(
+            degree=self.rules["polynomial_degree"],
+            include_bias=False
+        )
+
+        poly_features = poly.fit_transform(df[features_to_transform])
+        feature_names = poly.get_feature_names_out(features_to_transform)
+
+        # Skip the leading outputs (one per input column) and add only the
+        # interaction terms and higher degree terms
+        first_new = len(features_to_transform)
+        for name, values in zip(feature_names[first_new:], poly_features[:, first_new:].T):
+            result[f"poly_{name}"] = values
+
+        return result
+
+    def create_feature_ratios(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate feature ratios based on configured rules"""
+        return self._apply_formula_rules(df, self.rules["feature_ratios"])
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Apply all feature engineering transformations
+
+        Order matters: ratio formulas may refer to columns created by the
+        health indicators. Each step works on a copy, so df is not modified.
+        """
+        result = self.create_health_indicators(df)
+        result = self.create_polynomial_features(result)
+        result = self.create_feature_ratios(result)
+        return result