diff --git a/projects/prediction/Smoking Prediction/__init__.py b/projects/prediction/Smoking Prediction/__init__.py
new file mode 100644
index 000000000..3f0d20d25
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/__init__.py	
@@ -0,0 +1,3 @@
+"""
+Components module initialization
+"""
\ No newline at end of file
diff --git a/projects/prediction/Smoking Prediction/data_ingestion.py b/projects/prediction/Smoking Prediction/data_ingestion.py
new file mode 100644
index 000000000..cf6ef6318
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/data_ingestion.py	
@@ -0,0 +1,58 @@
+
+#? STAGE 1: DATA INGESTION
+
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+pd.set_option('display.max_columns', None)
+
+class DataIngestion:
+    def __init__(self, dataset_paths):
+        """
+        dataset_paths: dictionary mapping each dataset name to a dictionary of CSV paths.
+        Only the "train" entry is read; load_data() derives the test split from it.
+        {
+            "dataset1": {"train": "path/to/dataset1_train.csv"},
+            "dataset2": {"train": "path/to/dataset2_train.csv"}
+        }
+        """
+        self.dataset_paths = dataset_paths
+
+    def load_data(self):
+        """Read every "train" CSV and return {name: {"train": df, "test": df}}."""
+        splits_by_name = {}
+        for name, path_map in self.dataset_paths.items():
+            # The whole labelled file is read; no separate test file is consulted.
+            full_frame = pd.read_csv(path_map["train"])
+
+            # Fixed 80/20 split, seeded so reruns select the same rows.
+            train_part, test_part = train_test_split(
+                full_frame, test_size=0.2, random_state=42
+            )
+
+            # Nested mapping: dataset name -> split name -> DataFrame.
+            splits_by_name[name] = {
+                "train": train_part,
+                "test": test_part
+            }
+        return splits_by_name
+
+dataset_paths = {
+    "ml-olympiad-smoking": {
+        "train": "Y:/SmokingML V2/data/raw/ml-olympiad-smoking/train.csv"
+    },
+    "archive": {
+        "train": "Y:/SmokingML V2/data/raw/archive/train_dataset.csv"
+    }
+}
+
+# Executed at import time: both CSV files are read and split whenever this module is imported
+data_ingestion = DataIngestion(dataset_paths)
+datasets = data_ingestion.load_data()
+
+# Report the type and shape of each dataset's training split
+print("ML Olympiad Training Data Type:", type(datasets["ml-olympiad-smoking"]["train"]))
+print("ML Olympiad Training Data Shape:", datasets["ml-olympiad-smoking"]["train"].shape)
+print("Archive Training Data Type:", type(datasets["archive"]["train"]))
+print("Archive Training Data Shape:", datasets["archive"]["train"].shape)
diff --git a/projects/prediction/Smoking Prediction/data_preprocessing.py b/projects/prediction/Smoking Prediction/data_preprocessing.py
new file mode 100644
index 000000000..9894b0674
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/data_preprocessing.py	
@@ -0,0 +1,224 @@
+
+#? STAGE 2: DATA PREPROCESSING
+
+#* Importing dependencies
+import pandas as pd 
+import numpy as np
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
+from sklearn.decomposition import PCA
+from sklearn.model_selection import train_test_split
+import seaborn as sns
+import matplotlib.pyplot as plt
+import os
+from src.components.data_ingestion import datasets
+
+
+#* Define Preprocessing Function
+def preprocess_data(train_df, test_df):
+    """Impute, one-hot encode and scale the features, then split train into train/validation.
+
+    train_df must contain the 'smoking' target; test_df may or may not contain it.
+    Every transformer is fitted on train_df only. test_df is transformed with them but
+    is not part of the return value: (x_train, x_val, y_train, y_val, selected_features).
+    """
+    # Only the training target is used below, so an unlabeled test_df is accepted.
+    train_target = train_df['smoking']
+    train_features = train_df.drop('smoking', axis=1)
+    test_features = test_df.drop(columns='smoking', errors='ignore')
+
+    # Get numeric columns excluding target
+    num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
+
+    # Handle missing values for numeric columns
+    imputer = SimpleImputer(strategy='mean')
+    train_features[num_cols] = imputer.fit_transform(train_features[num_cols])
+    test_features[num_cols] = imputer.transform(test_features[num_cols])
+
+    # Handle categorical values
+    cat_cols = train_features.select_dtypes(include=['object']).columns
+    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+
+    # Encode categorical columns
+    if len(cat_cols) > 0:
+        train_encoded = pd.DataFrame(
+            encoder.fit_transform(train_features[cat_cols]),
+            index=train_features.index,
+            columns=encoder.get_feature_names_out(cat_cols)
+        )
+        test_encoded = pd.DataFrame(
+            encoder.transform(test_features[cat_cols]),
+            index=test_features.index,
+            columns=encoder.get_feature_names_out(cat_cols)
+        )
+
+        # Drop original categorical columns; the row index is kept so concat aligns rows
+        train_features = train_features.drop(cat_cols, axis=1)
+        test_features = test_features.drop(cat_cols, axis=1)
+
+        # Concatenate encoded features
+        train_features = pd.concat([train_features, train_encoded], axis=1)
+        test_features = pd.concat([test_features, test_encoded], axis=1)
+
+    # Feature Scaling - only scale numeric columns
+    scaler = StandardScaler()
+    train_features[num_cols] = scaler.fit_transform(train_features[num_cols])
+    test_features[num_cols] = scaler.transform(test_features[num_cols])
+
+    # Split training data into train and validation sets.
+    # NOTE(review): imputer and scaler were fitted on all training rows above, so the
+    # validation rows have already influenced their statistics - confirm this is intended.
+    x_train, x_val, y_train, y_val = train_test_split(
+        train_features, train_target,
+        test_size=0.2,
+        random_state=42
+    )
+
+    # Column names of the final feature matrix, in order
+    selected_features = x_train.columns.tolist()
+
+    return x_train, x_val, y_train, y_val, selected_features
+
+
+def remove_low_variance_features(train_df, test_df, threshold=0.01):
+    """Apply VarianceThreshold(threshold), fitted on train_df's features, to both frames.
+
+    A 'smoking' column, where present, is set aside first and re-attached unchanged.
+    """
+    def _split_target(frame):
+        if 'smoking' in frame.columns:
+            return frame.drop('smoking', axis=1), frame['smoking']
+        return frame, None
+
+    train_features, train_target = _split_target(train_df)
+    test_features, test_target = _split_target(test_df)
+    selector = VarianceThreshold(threshold)
+    kept_train = selector.fit_transform(train_features)
+    kept_test = selector.transform(test_features)
+    kept_names = train_features.columns[selector.get_support()]
+    train_selected = pd.DataFrame(kept_train, columns=kept_names, index=train_df.index)
+    test_selected = pd.DataFrame(kept_test, columns=kept_names, index=test_df.index)
+    for frame, target in ((train_selected, train_target), (test_selected, test_target)):
+        if target is not None:
+            frame['smoking'] = target
+    return train_selected, test_selected
+
+
+def remove_highly_correlated_features(train_df, test_df, threshold=0.9):
+    """Drop each column whose absolute correlation with an earlier train_df column exceeds `threshold`."""
+    upper_triangle = train_df.corr().abs().where(lambda m: np.triu(np.ones(m.shape, dtype=bool), k=1))  # abs(): -0.95 is as redundant as +0.95
+    drop_cols = [column for column in upper_triangle.columns if (upper_triangle[column] > threshold).any()]
+    return train_df.drop(columns=drop_cols), test_df.drop(columns=drop_cols)
+
+
+def select_features_by_mutual_info(train_df, test_df, target_column, num_features=15, random_state=42):
+    """Keep the `num_features` columns of train_df that score highest on mutual information.
+
+    Scores are estimated on train_df only; `random_state` is passed to the estimator so reruns agree.
+    """
+    X = train_df.drop(columns=[target_column])
+    mutual_info = mutual_info_classif(X, train_df[target_column], discrete_features='auto', random_state=random_state)
+    selected_features = pd.Series(mutual_info, index=X.columns).nlargest(num_features).index.to_list()
+
+    # test_df keeps the target column only when it actually has one.
+    test_columns = (selected_features + [target_column]) if target_column in test_df.columns else selected_features
+    return train_df[selected_features + [target_column]], test_df[test_columns]
+
+
+def apply_pca(train_df, test_df, n_components=10):
+    """Fit PCA on train_df, project both inputs, and return the components as DataFrames keeping each input's row index."""
+    pca = PCA(n_components=n_components)
+    train_pca = pd.DataFrame(pca.fit_transform(train_df), index=getattr(train_df, "index", None))
+    return train_pca, pd.DataFrame(pca.transform(test_df), index=getattr(test_df, "index", None))
+
+
+if __name__ == "__main__":
+    #* Load the train split and the held-out "test" split of each dataset (both come from data_ingestion)
+    train_ml = pd.DataFrame(datasets["ml-olympiad-smoking"]["train"])
+    test_ml = pd.DataFrame(datasets["ml-olympiad-smoking"]["test"])
+    train_archive = pd.DataFrame(datasets["archive"]["train"])
+    test_archive = pd.DataFrame(datasets["archive"]["test"])
+
+    print("DISPLAY BASIC INFORMATION")
+    print("ML Olympiad Train Data Shape:", train_ml.shape)
+    print("ML Olympiad Test Data Shape:", test_ml.shape)
+    print(train_ml.head())
+    print("Archive Train Data Shape:", train_archive.shape)
+    print("Archive Test Data Shape:", test_archive.shape)
+    print(test_archive.head())
+
+    #* Apply Preprocessing to both datasets; each call returns train/validation parts of the train split
+    x_train_ml, x_val_ml, y_train_ml, y_val_ml, selected_features_ml = preprocess_data(train_ml, test_ml)
+    x_train_archive, x_val_archive, y_train_archive, y_val_archive, selected_features_archive = preprocess_data(train_archive, test_archive)
+
+    preprocessed_data_paths = {
+        "ml-olympiad-smoking": {
+            "train": "Y:/SmokingML V2/data/processed/ml_olympiad_train.csv",
+            "test": "Y:/SmokingML V2/data/processed/ml_olympiad_test.csv"
+        },
+        "archive": {
+            "train": "Y:/SmokingML V2/data/processed/archive_train.csv",
+            "test": "Y:/SmokingML V2/data/processed/archive_test.csv"
+        }
+    }
+
+    for dataset_name, paths in preprocessed_data_paths.items():
+        for key, path in paths.items():
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+    # Note: the files registered under "test" receive the validation split (x_val / y_val)
+    pd.concat([x_train_ml, y_train_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["train"], index=False)
+    pd.concat([x_val_ml, y_val_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["test"], index=False)
+    pd.concat([x_train_archive, y_train_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["train"], index=False)
+    pd.concat([x_val_archive, y_val_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["test"], index=False)
+
+    print("Preprocessed data has been saved successfully!")
+
+    #* Variance Thresholding (fitted on the train part, applied to the validation part)
+    preprocessed_train_ml, preprocessed_test_ml = remove_low_variance_features(pd.concat([x_train_ml, y_train_ml], axis=1), pd.concat([x_val_ml, y_val_ml], axis=1))
+    preprocessed_train_archive, preprocessed_test_archive = remove_low_variance_features(pd.concat([x_train_archive, y_train_archive], axis=1), pd.concat([x_val_archive, y_val_archive], axis=1))
+
+    #* Feature Selection (mutual information against 'smoking')
+    preprocessed_train_ml, preprocessed_test_ml = select_features_by_mutual_info(preprocessed_train_ml, preprocessed_test_ml, target_column='smoking')
+    preprocessed_train_archive, preprocessed_test_archive = select_features_by_mutual_info(preprocessed_train_archive, preprocessed_test_archive, target_column='smoking')
+
+    #* ✅ Optional assertion checks
+    assert 'smoking' in preprocessed_train_ml.columns, "Target column 'smoking' missing in training set!"
+    assert 'smoking' in preprocessed_test_ml.columns, "Target column 'smoking' missing in test set!"
+    assert 'smoking' in preprocessed_train_archive.columns, "Target column 'smoking' missing in archive training set!"
+    assert 'smoking' in preprocessed_test_archive.columns, "Target column 'smoking' missing in archive test set!"
+
+    #* ✅ Debug: Show absolute save paths
+    print("\n✅ Saving preprocessed files to:")
+    print("ML Train Path      :", os.path.abspath("Y:/SmokingML V2/data/processed/train_ml.csv"))
+    print("ML Test Path       :", os.path.abspath("Y:/SmokingML V2/data/processed/test_ml.csv"))
+    print("Archive Train Path :", os.path.abspath("Y:/SmokingML V2/data/processed/train_archive.csv"))
+    print("Archive Test Path  :", os.path.abspath("Y:/SmokingML V2/data/processed/test_archive.csv"))
+
+    #* Save final preprocessed files
+    preprocessed_train_ml.to_csv("Y:/SmokingML V2/data/processed/train_ml.csv", index=False)
+    preprocessed_test_ml.to_csv("Y:/SmokingML V2/data/processed/test_ml.csv", index=False)
+    preprocessed_train_archive.to_csv("Y:/SmokingML V2/data/processed/train_archive.csv", index=False)
+    preprocessed_test_archive.to_csv("Y:/SmokingML V2/data/processed/test_archive.csv", index=False)
+
+    print("Feature Engineering and Selection completed Successfully!")
+
+
+    import json
+
+    #* Save selected features to JSON for both datasets
+    selected_features_dir = "Y:/SmokingML V2/artifacts/models"
+    os.makedirs(selected_features_dir, exist_ok=True)
+
+    # The JSON lists hold feature names only, so the 'smoking' target is filtered out
+    selected_columns_olympiad = [col for col in preprocessed_train_ml.columns if col != 'smoking']
+    selected_columns_archive = [col for col in preprocessed_train_archive.columns if col != 'smoking']
+
+    # Save to JSON
+    with open(os.path.join(selected_features_dir, "feature_columns_olympiad.json"), "w") as f:
+        json.dump(selected_columns_olympiad, f, indent=4)
+
+    with open(os.path.join(selected_features_dir, "feature_columns_archive.json"), "w") as f:
+        json.dump(selected_columns_archive, f, indent=4)
+
+    print("✅ Feature columns JSON files saved successfully!")
diff --git a/projects/prediction/Smoking Prediction/feature_engineering.py b/projects/prediction/Smoking Prediction/feature_engineering.py
new file mode 100644
index 000000000..f71554373
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/feature_engineering.py	
@@ -0,0 +1,106 @@
+
+#? STAGE 3: FEATURE ENGINEERING
+
+import json
+import numpy as np
+import pandas as pd
+from typing import Dict, List
+from sklearn.preprocessing import PolynomialFeatures
+from pathlib import Path
+
+class FeatureEngineer:
+    """Add rule-driven derived columns (formula features and polynomial terms) to a DataFrame."""
+
+    def __init__(self):
+        self.rules = self._load_rules()
+
+    def _load_rules(self) -> Dict:
+        """Load rules from config/feature_engineering_rules.json (relative to the CWD), else use defaults"""
+        config_path = Path("config/feature_engineering_rules.json")
+        if not config_path.exists():
+            return self._get_default_rules()
+
+        with open(config_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    def _get_default_rules(self) -> Dict:
+        """Default feature engineering rules if no config exists"""
+        return {
+            "health_indicators": [
+                {
+                    "name": "bmi_health_index",
+                    "formula": "weight / (height ** 2)",
+                    "enabled": True,
+                    "description": "BMI-based health indicator"
+                }
+            ],
+            "polynomial_features": ["age", "weight", "height"],
+            "feature_ratios": [
+                {
+                    "name": "age_bmi_ratio",
+                    "formula": "age / bmi_health_index",
+                    "enabled": True
+                }
+            ],
+            "polynomial_degree": 2
+        }
+
+    def _apply_formula_rules(self, df: pd.DataFrame, rules: List[Dict]) -> pd.DataFrame:
+        """Evaluate each enabled rule's formula and store it as a new column on a copy of df.
+
+        Formulas see the columns (including ones added by earlier rules) as variables, plus
+        `np`. A rule without an "enabled" key counts as enabled; a failing formula is printed and skipped.
+        """
+        result = df.copy()
+        for rule in rules:
+            if not rule.get("enabled", True):
+                continue
+            try:
+                # SECURITY: eval() runs whatever the rules file contains. Emptying
+                # __builtins__ is not a sandbox; only load rule files from trusted sources.
+                result[rule["name"]] = eval(rule["formula"],
+                                            {"__builtins__": None},
+                                            {**dict(result), "np": np})
+            except Exception as e:
+                print(f"Failed to calculate {rule['name']}: {str(e)}")
+        return result
+
+    def create_health_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate health indicator features based on configured rules"""
+        return self._apply_formula_rules(df, self.rules["health_indicators"])
+
+    def create_polynomial_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate polynomial features for specified columns"""
+        result = df.copy()
+        features_to_transform = [col for col in self.rules["polynomial_features"]
+                                 if col in df.columns]
+
+        if not features_to_transform:
+            return result
+
+        poly = PolynomialFeatures(
+            degree=self.rules["polynomial_degree"],
+            include_bias=False
+        )
+
+        poly_features = poly.fit_transform(df[features_to_transform])
+        feature_names = poly.get_feature_names_out(features_to_transform)
+
+        # Skip the leading degree-1 terms (the inputs themselves); keep interactions and powers
+        for i, name in enumerate(feature_names[len(features_to_transform):],
+                                 start=len(features_to_transform)):
+            result[f"poly_{name}"] = poly_features[:, i]
+
+        return result
+
+    def create_feature_ratios(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Generate feature ratios based on configured rules"""
+        return self._apply_formula_rules(df, self.rules["feature_ratios"])
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Apply all feature engineering transformations"""
+        result = df.copy()
+        result = self.create_health_indicators(result)
+        result = self.create_polynomial_features(result)
+        result = self.create_feature_ratios(result)
+        return result
\ No newline at end of file
diff --git a/projects/prediction/Smoking Prediction/model_deployment.py b/projects/prediction/Smoking Prediction/model_deployment.py
new file mode 100644
index 000000000..93ac94cff
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/model_deployment.py	
@@ -0,0 +1,847 @@
+
+#? STAGE 8: MODEL DEPLOYMENT 
+
+from fastapi import FastAPI, HTTPException, Path, Body
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+import pandas as pd
+import joblib
+import os
+import sys
+import logging
+from datetime import datetime
+from dotenv import load_dotenv
+from fastapi.openapi.utils import get_openapi
+from sklearn.ensemble import VotingClassifier
+from sklearn.preprocessing import PolynomialFeatures
+from typing import Optional, List, Dict
+from contextlib import asynccontextmanager
+import socket
+import uvicorn
+from fastapi.openapi.docs import get_swagger_ui_html
+from fastapi.responses import HTMLResponse
+import json
+from .feature_engineering import FeatureEngineer
+
+# Log locations: LOG_DIR is the 'logs' folder reached by three dirname() steps up from this file's path
+LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs')
+API_LOG_DIR = os.path.join(LOG_DIR, 'api')
+DEPLOYMENT_LOG_DIR = os.path.join(LOG_DIR, 'deployment')
+ERROR_LOG_DIR = os.path.join(LOG_DIR, 'errors')
+
+# Create log directories if they don't exist
+os.makedirs(LOG_DIR, exist_ok=True)
+os.makedirs(API_LOG_DIR, exist_ok=True)
+os.makedirs(DEPLOYMENT_LOG_DIR, exist_ok=True)
+os.makedirs(ERROR_LOG_DIR, exist_ok=True)
+
+# Root logger at DEBUG; the two dated files and stdout are all root handlers, so each receives every record
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(os.path.join(API_LOG_DIR, f'api_{datetime.now().strftime("%Y%m%d")}.log')),
+        logging.FileHandler(os.path.join(DEPLOYMENT_LOG_DIR, f'deployment_{datetime.now().strftime("%Y%m%d")}.log')),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+# Extra root handler that keeps only ERROR and above in its own dated file
+error_handler = logging.FileHandler(os.path.join(ERROR_LOG_DIR, f'error_{datetime.now().strftime("%Y%m%d")}.log'))
+error_handler.setLevel(logging.ERROR)
+error_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+logging.getLogger().addHandler(error_handler)
+
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv()
+
+# MODEL_PATH env var wins, else "<this file's dir>/../../models"; `models` is filled by lifespan(), `model_parameters` is read by predict()
+MODEL_PATH = os.getenv("MODEL_PATH", os.path.abspath(os.path.join(os.path.dirname(__file__), "../../models")))
+logger.info(f"Using model path: {MODEL_PATH}")
+models = {}
+model_parameters = {}
+
+# Artifact base name (file name without .pkl) -> display label; only these names are loaded at startup
+BEST_MODELS = {
+    "ml_olympiad_improved_final": "ML Olympiad – Improved XGBoost",
+    "archive_improved_final": "Archive – Improved Ensemble"
+}
+
+# Fallback parameters used by predict() when a model has no entry in model_parameters
+DEFAULT_MODEL_PARAMETERS = {
+    "confidence_threshold": 0.5,
+    "class_weights": {"0": 1.0, "1": 1.0},
+    "health_indicator_thresholds": {
+        "bmi": {"low": 18.5, "high": 25.0},
+        "liver_function": {"low": 10.0, "high": 50.0},
+        "cardiovascular_risk": {"low": 1.0, "high": 5.0},
+        "metabolic_index": {"low": 0.5, "high": 2.5}
+    }
+}
+
+class ModelParameters(BaseModel):
+    # Tunable per-model settings; each field's default matches the value in DEFAULT_MODEL_PARAMETERS.
+    confidence_threshold: Optional[float] = Field(0.5, ge=0.0, le=1.0)
+    class_weights: Optional[Dict[str, float]] = Field(
+        default_factory=lambda: {"0": 1.0, "1": 1.0}
+    )
+    # Copy the nested defaults per instance so editing one instance cannot alter the shared constant.
+    health_indicator_thresholds: Optional[Dict[str, Dict[str, float]]] = Field(
+        default_factory=lambda: {k: dict(v) for k, v in DEFAULT_MODEL_PARAMETERS["health_indicator_thresholds"].items()}
+    )
+
+    class Config:
+        json_schema_extra = {"example": DEFAULT_MODEL_PARAMETERS}
+
+# Define lifespan to load models and handle startup logging
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load the BEST_MODELS artifacts from MODEL_PATH at startup (raising on failure) and clear them at shutdown."""
+    try:
+        logger.info(f"Starting model loading from {MODEL_PATH}")
+        if not os.path.exists(MODEL_PATH):
+            error_msg = f"Model directory not found at {MODEL_PATH}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+        model_files = [f for f in os.listdir(MODEL_PATH) if f.endswith('.pkl')]
+        logger.info(f"Found model files: {model_files}")
+        if not model_files:
+            error_msg = f"No .pkl model files found in {MODEL_PATH}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+        for model_file in model_files:
+            model_name = os.path.splitext(model_file)[0]  # strip only the trailing extension
+            if model_name in BEST_MODELS:
+                model_path = os.path.join(MODEL_PATH, model_file)
+                try:
+                    logger.info(f"Loading model {model_name} from {model_path}")
+                    model_artifacts = joblib.load(model_path)  # unpickles: MODEL_PATH must hold trusted files only
+                    models[model_name] = model_artifacts['model']
+                    logger.info(f"Successfully loaded model: {model_name}")
+                except Exception as e:
+                    logger.error(f"Error loading model {model_name}: {str(e)}", exc_info=True)
+                    raise
+
+        if not models:
+            error_msg = f"No best models found for deployment in {MODEL_PATH}. Expected models: {list(BEST_MODELS.keys())}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+        logger.info("=== Server Starting ===")
+        logger.info(get_ip())
+        logger.info("You can access the API at:")
+        logger.info("    http://127.0.0.1:8000")
+        logger.info("    http://localhost:8000")
+        logger.info("API documentation available at:")
+        logger.info("    http://127.0.0.1:8000/docs")
+        logger.info("    http://localhost:8000/docs")
+        logger.info("Try both URLs if one doesn't work")
+        logger.info("All models loaded successfully. Ready to serve.")
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}", exc_info=True)
+        raise
+    try:
+        yield
+    finally:
+        # Runs even when the application stops because of an error.
+        logger.info("Cleaning up models")
+        models.clear()
+
+# Initialize FastAPI app with lifespan; the built-in /docs and /redoc routes are switched off here
+app = FastAPI(
+    lifespan=lifespan,
+    title="Smoking Status Prediction API",
+    description="API for predicting smoking status using machine learning models",
+    version="2.0.0",
+    docs_url=None,
+    redoc_url=None
+)
+
+# CORS: every origin, method and header is allowed, with credentials enabled
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allow all origins for testing - restrict this in production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+    expose_headers=["*"]
+)
+
+# Add socket info logging
+def get_ip():
+    """Describe this host (name and resolved address) for log output; never raises."""
+    try:
+        # Resolves the machine's own hostname: a single lookup, not an interface scan.
+        host = socket.gethostname()
+        return f"Hostname: {host}, Local IP: {socket.gethostbyname(host)}"
+    except Exception as err:
+        return f"Could not determine IP: {str(err)}"
+
+# Custom OpenAPI schema
+def custom_openapi():
+    """Return the app's OpenAPI schema, generating and caching it on first use.
+
+    The generated schema is extended with a "tags" list whose entries carry a
+    description and an "x-tag-style" background colour for each endpoint group.
+    """
+    # Reuse the schema stored by an earlier call, if any.
+    cached = app.openapi_schema
+    if cached:
+        return cached
+
+    schema = get_openapi(
+        title="Smoking Status Prediction API",
+        version="2.0.0",
+        description="**API for predicting smoking status using best-performing or ensemble ML models**",
+        routes=app.routes,
+    )
+
+    # (tag name, description, background colour) - one row per documented tag
+    tag_rows = (
+        ("Root", "**Root endpoint operations**", "#FFEB3B"),
+        ("Models", "**Model listing operations**", "#FF69B4"),
+        ("Health", "**Health check operations**", "#4CAF50"),
+        ("Predictions", "**Smoking status prediction operations**", "#2196F3"),
+        ("Feature Engineering", "**Feature engineering rules management**", "#9C27B0"),
+    )
+
+    # Define tags with descriptions and colors
+    schema["tags"] = [
+        {
+            "name": tag_name,
+            "description": tag_description,
+            "x-tag-style": {
+                "background-color": colour,
+            },
+        }
+        for tag_name, tag_description, colour in tag_rows
+    ]
+
+    # Store the result so later calls return it from the check at the top.
+    app.openapi_schema = schema
+    return app.openapi_schema
+
+app.openapi = custom_openapi
+
+# Define input schema
+class SmokingPredictionInput(BaseModel):
+    height_cm: float = Field(..., alias="height(cm)")  # required; the request key is the alias
+    weight_kg: float = Field(..., alias="weight(kg)")  # required
+    waist_cm: float = Field(..., alias="waist(cm)")  # required
+    age: float
+    ALT: float
+    Gtp: float
+    HDL: float
+    LDL: float = Field(0.0)  # may be omitted; defaults to 0.0
+    Cholesterol: float = Field(0.0)  # may be omitted; defaults to 0.0
+    systolic: float
+    relaxation: float
+    hemoglobin: float
+    serum_creatinine: float = Field(..., alias="serum creatinine")  # required
+    triglyceride: float
+    AST: Optional[float] = Field(0.0)  # the fields from here down may be omitted or null
+    dental_caries: Optional[int] = Field(0, alias="dental caries")
+    eyesight_right: Optional[float] = Field(0.0, alias="eyesight(right)")
+    eyesight_left: Optional[float] = Field(0.0, alias="eyesight(left)")
+    fasting_blood_sugar: Optional[float] = Field(0.0, alias="fasting blood sugar")
+
+    class Config:
+        populate_by_name = True  # accept the Python field names as well as the aliases
+        json_schema_extra = {
+            "example": {
+                "height(cm)": 170.0,
+                "weight(kg)": 70.0,
+                "waist(cm)": 85.0,
+                "eyesight(left)": 1.0,
+                "eyesight(right)": 1.0,
+                "age": 35.0,
+                "ALT": 25.0,
+                "AST": 20.0,
+                "Gtp": 30.0,
+                "HDL": 50.0,
+                "LDL": 100.0,
+                "Cholesterol": 180.0,
+                "dental caries": 0,
+                "fasting blood sugar": 90.0,
+                "relaxation": 80.0,
+                "serum creatinine": 1.0,
+                "triglyceride": 150.0,
+                "hemoglobin": 15.0,
+                "systolic": 120.0
+            }
+        }
+
+# Root endpoint with enhanced response
+@app.get("/", tags=["Root"], response_model=dict)
+async def root():
+    """Root endpoint with detailed API information and status"""
+    try:
+        host_summary = get_ip()
+        logger.info(f"Root endpoint accessed. {host_summary}")
+        api_info = {
+            "name": "Enhanced Smoking Prediction API",
+            "version": "2.0.0",
+            "description": "Machine Learning API for Smoking Status Prediction"
+        }
+        model_info = {
+            "available": list(models.keys()),
+            "total_count": len(models),
+            "model_path": MODEL_PATH
+        }
+        endpoint_map = {
+            "documentation": "/docs",
+            "health_check": "/health",
+            "models_list": "/models",
+            "prediction": "/predict/{model_name}"
+        }
+        server_info = {
+            "status": "healthy",
+            "network": host_summary,
+            "timestamp": datetime.now().isoformat()
+        }
+        return {
+            "status": "success",
+            "api_info": api_info,
+            "models": model_info,
+            "endpoints": endpoint_map,
+            "server_info": server_info
+        }
+    except Exception as exc:
+        failure = f"Error accessing root endpoint: {str(exc)}"
+        logger.error(failure)
+        raise HTTPException(status_code=500, detail={"error": failure})
+
+# Health check endpoint
+@app.get("/health", tags=["Health"])
+async def health_check():
+    # Always answers "healthy"; also lists whichever models are loaded right now.
+    logger.info("Health check endpoint accessed")
+    loaded_names = list(models.keys())
+    return {
+        "status": "healthy", "models_loaded": loaded_names,
+        "model_path": MODEL_PATH, "timestamp": datetime.now().isoformat(),
+    }
+
+# Endpoint to list models
+@app.get("/models", tags=["Models"])
+async def list_models():
+    # Reports the deployable catalogue (BEST_MODELS) next to what is actually loaded.
+    logger.info("Models endpoint accessed")
+    loaded_names = list(models.keys())
+    return {
+        "available_models": BEST_MODELS, "loaded_models": loaded_names,
+        "total": len(BEST_MODELS), "model_path": MODEL_PATH,
+    }
+
+# Prediction endpoint
+#
+# Flow: validate the model name -> look up per-model parameters -> turn the
+# single input record into a one-row DataFrame -> derive health indicators,
+# ratios and polynomial features -> select the model-specific feature list in
+# order -> predict -> apply the confidence threshold -> build the response.
+#
+# Errors: unknown model -> 404; any failure while deriving features or a
+# missing required feature -> 400; any other unexpected failure -> 500.
+@app.post("/predict/{model_name}", tags=["Predictions"])
+async def predict(
+    model_name: str = Path(
+        ...,
+        description="Available models: ml_olympiad_improved_final, archive_improved_final"
+    ),
+    input_data: SmokingPredictionInput = Body(...)
+):
+    logger.info(f"Prediction requested for model: {model_name}")
+    try:
+        # Clean up model name
+        model_name = model_name.strip()
+        
+        if model_name not in models:
+            error_msg = f"Model '{model_name}' not found. Available models: {list(models.keys())}"
+            logger.error(error_msg)
+            raise HTTPException(status_code=404, detail={"error": error_msg})
+
+        # Get model parameters or use defaults
+        model_params = model_parameters.get(model_name, DEFAULT_MODEL_PARAMETERS)
+        confidence_threshold = model_params["confidence_threshold"]
+        health_thresholds = model_params["health_indicator_thresholds"]
+
+        # Convert input data to DataFrame
+        # by_alias=True keeps the aliased field names as keys; the column
+        # lookups below (e.g. 'weight(kg)', 'fasting blood sugar') rely on them.
+        input_dict = input_data.dict(by_alias=True)
+        logger.debug(f"Raw input data: {input_dict}")
+        data = pd.DataFrame([input_dict])
+
+        try:
+            # 1. Initialize all required numeric columns with safe defaults
+            # `data` has exactly one row (index 0), so `[0]` reads that row's
+            # value; the list default covers a column that is absent entirely.
+            default_values = {
+                'systolic': data.get('systolic', [0.0])[0],
+                'triglyceride': data.get('triglyceride', [0.0])[0],
+                'HDL': max(data.get('HDL', [1.0])[0], 1.0),  # Ensure HDL is at least 1
+                'LDL': data.get('LDL', [0.0])[0],
+                'AST': data.get('AST', [0.0])[0],
+                'ALT': data.get('ALT', [0.0])[0],
+                'Gtp': data.get('Gtp', [0.0])[0],
+                'fasting blood sugar': data.get('fasting blood sugar', [0.0])[0]
+            }
+
+            # Update DataFrame with safe values
+            # Missing (NaN/None) values fall back to 0.0, except HDL which falls
+            # back to 1.0 because it is used as a divisor further down.
+            for col, value in default_values.items():
+                if pd.isna(value):
+                    data[col] = 0.0 if col != 'HDL' else 1.0
+                else:
+                    data[col] = value
+
+            logger.debug("Initialized features with safe values")
+
+            # 2. Calculate basic health indicators
+            # NOTE(review): 'height(cm)', 'weight(kg)' and 'relaxation' are not
+            # defaulted above; presumably the input model guarantees them and a
+            # non-zero height - confirm against SmokingPredictionInput.
+            data['bmi'] = data['weight(kg)'] / ((data['height(cm)']/100) ** 2)
+            data['liver_function'] = (data['AST'] + data['ALT'] + data['Gtp']) / 3
+            data['cardiovascular_risk'] = (data['systolic'] * data['triglyceride']) / data['HDL']
+            data['metabolic_index'] = (data['fasting blood sugar'] * data['bmi']) / data['HDL']
+
+            # 3. Calculate health status indicators
+            # Each status is 1 when the indicator lies inside [low, high] of the
+            # configured thresholds, else 0. These *_status columns are not in
+            # either required_features list below; the response recomputes the
+            # same range checks from the raw indicators.
+            data['bmi_status'] = ((data['bmi'] >= health_thresholds['bmi']['low']) & 
+                               (data['bmi'] <= health_thresholds['bmi']['high'])).astype(int)
+            
+            data['liver_status'] = ((data['liver_function'] >= health_thresholds['liver_function']['low']) & 
+                                 (data['liver_function'] <= health_thresholds['liver_function']['high'])).astype(int)
+            
+            data['cv_risk_status'] = ((data['cardiovascular_risk'] >= health_thresholds['cardiovascular_risk']['low']) & 
+                                   (data['cardiovascular_risk'] <= health_thresholds['cardiovascular_risk']['high'])).astype(int)
+            
+            data['metabolic_status'] = ((data['metabolic_index'] >= health_thresholds['metabolic_index']['low']) & 
+                                    (data['metabolic_index'] <= health_thresholds['metabolic_index']['high'])).astype(int)
+
+            # 4. Calculate additional ratios
+            # The +1 in each denominator avoids division by zero.
+            # 'bp_ratio' is computed but is not part of either feature list below.
+            data['hdl_ldl_ratio'] = data['HDL'] / (data['LDL'] + 1)
+            data['ast_alt_ratio'] = data['AST'] / (data['ALT'] + 1)
+            data['bp_ratio'] = data['systolic'] / (data['relaxation'] + 1)
+
+            # 5. Generate polynomial features based on model type
+            # Any loaded model other than 'ml_olympiad_improved_final' takes the
+            # archive branch.
+            if model_name == 'ml_olympiad_improved_final':
+                key_features = ['bmi', 'liver_function', 'cardiovascular_risk', 'metabolic_index']
+                poly = PolynomialFeatures(degree=2, include_bias=False)
+                poly_features = poly.fit_transform(data[key_features])
+                for i in range(poly_features.shape[1]):
+                    data[f'health_poly_{i}'] = poly_features[:, i]
+            else:  # archive_improved_final
+                # For archive model, we only need specific polynomial features
+                key_features = ['bmi', 'liver_function', 'cardiovascular_risk']
+                poly = PolynomialFeatures(degree=2, include_bias=False)
+                poly_features = poly.fit_transform(data[key_features])
+                # Only keep required polynomial features (0, 4, 5)
+                # NOTE(review): with three inputs, degree 2 and no bias column,
+                # scikit-learn orders the output as
+                # [x0, x1, x2, x0^2, x0*x1, x0*x2, ...], so index 4 is
+                # bmi*liver_function and index 5 is bmi*cardiovascular_risk.
+                # Confirm these match the health_poly_4 / health_poly_5 columns
+                # the archive model was trained on.
+                data['health_poly_0'] = poly_features[:, 0]  # First feature
+                data['health_poly_4'] = poly_features[:, 4]  # Fifth feature
+                data['health_poly_5'] = poly_features[:, 5]  # Sixth feature
+
+            logger.debug("All features calculated successfully")
+            logger.debug(f"Available features: {list(data.columns)}")
+
+        except Exception as e:
+            # Any failure while deriving features is reported as a client error (400)
+            error_msg = f"Error calculating health indicators: {str(e)}"
+            logger.error(error_msg)
+            logger.error(f"Data state: {data.to_dict()}")
+            raise HTTPException(status_code=400, detail={"error": error_msg})
+
+        # Select features based on model type
+        if model_name == 'ml_olympiad_improved_final':
+            required_features = [
+                "age", "height(cm)", "weight(kg)", "systolic", "relaxation",
+                "Cholesterol", "triglyceride", "HDL", "LDL", "hemoglobin",
+                "serum creatinine", "AST", "ALT", "Gtp", "dental caries",
+                "health_poly_0", "health_poly_1", "health_poly_4", "health_poly_13",
+                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
+            ]
+        else:  # archive_improved_final
+            required_features = [
+                "age", "height(cm)", "weight(kg)", "waist(cm)", "systolic",
+                "relaxation", "fasting blood sugar", "triglyceride", "HDL",
+                "LDL", "hemoglobin", "serum creatinine", "ALT", "Gtp",
+                "dental caries", "health_poly_0", "health_poly_4", "health_poly_5",
+                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
+            ]
+
+        # Create a new DataFrame with only required features in correct order
+        prediction_data = pd.DataFrame()
+        for feature in required_features:
+            if feature not in data.columns:
+                error_msg = f"Missing required feature: {feature}"
+                logger.error(error_msg)
+                raise HTTPException(status_code=400, detail={"error": error_msg})
+            prediction_data[feature] = data[feature]
+
+        logger.debug(f"Final features for prediction: {list(prediction_data.columns)}")
+        
+        # Make prediction
+        model = models[model_name]
+        prediction = model.predict(prediction_data)[0]
+        probabilities = model.predict_proba(prediction_data)[0]
+        # Confidence is the highest class probability, whichever class that is
+        confidence = float(max(probabilities))
+        
+        # Apply confidence threshold
+        # A positive (1) is only reported when the model predicted 1 AND the
+        # confidence reaches the threshold; everything else is reported as 0.
+        adjusted_prediction = 1 if confidence >= confidence_threshold and prediction == 1 else 0
+        
+        result = {
+            "model_used": BEST_MODELS[model_name],
+            "prediction": int(adjusted_prediction),
+            "label": "Smoker" if adjusted_prediction == 1 else "Non-smoker",
+            "confidence": f"{confidence:.2%}",
+            "confidence_threshold": confidence_threshold,
+            # True when the indicator lies inside the configured [low, high] range
+            "health_indicators": {
+                "bmi_status": bool(data['bmi'].iloc[0] >= health_thresholds['bmi']['low'] and 
+                                 data['bmi'].iloc[0] <= health_thresholds['bmi']['high']),
+                "liver_status": bool(data['liver_function'].iloc[0] >= health_thresholds['liver_function']['low'] and 
+                                   data['liver_function'].iloc[0] <= health_thresholds['liver_function']['high']),
+                "cardiovascular_status": bool(data['cardiovascular_risk'].iloc[0] >= health_thresholds['cardiovascular_risk']['low'] and 
+                                           data['cardiovascular_risk'].iloc[0] <= health_thresholds['cardiovascular_risk']['high']),
+                "metabolic_status": bool(data['metabolic_index'].iloc[0] >= health_thresholds['metabolic_index']['low'] and 
+                                      data['metabolic_index'].iloc[0] <= health_thresholds['metabolic_index']['high'])
+            },
+            "calculated_features": {
+                "bmi": float(data['bmi'].iloc[0]),
+                "liver_function": float(data['liver_function'].iloc[0]),
+                "cardiovascular_risk": float(data['cardiovascular_risk'].iloc[0]),
+                "metabolic_index": float(data['metabolic_index'].iloc[0])
+            },
+            "model_type": "XGBoost" if model_name == "ml_olympiad_improved_final" else "Ensemble",
+            "features_used": required_features
+        }
+        
+        return result
+
+    except HTTPException:
+        # Deliberate HTTP errors raised above pass through unchanged
+        raise
+    except Exception as e:
+        # Anything unexpected is logged with its traceback and reported as a 500
+        error_msg = f"Error making prediction: {str(e)}"
+        logger.error(error_msg)
+        logger.error("Full traceback: ", exc_info=True)
+        raise HTTPException(status_code=500, detail={"error": error_msg})
+
+# Feature engineering rules models
+# One named rule, used for both health-indicator and feature-ratio entries
+# in FeatureEngineeringRules.
+class FeatureEngineeringRule(BaseModel):
+    # Identifier of the rule
+    name: str
+    # Formula text; stored as a plain string
+    # NOTE(review): nothing in the visible code evaluates it - confirm where it is consumed
+    formula: str
+    # Rules are enabled unless explicitly switched off
+    enabled: bool = True
+    # Optional free-text explanation
+    description: Optional[str] = None
+    # Optional degree, defaults to 2 and is validated to lie in 1..3
+    degree: Optional[int] = Field(default=2, ge=1, le=3)
+
+# Request body for PUT /feature-engineering/rules: the full rule set.
+class FeatureEngineeringRules(BaseModel):
+    # Rules describing health indicator calculations
+    health_indicators: List[FeatureEngineeringRule]
+    # Names listed for polynomial feature generation
+    polynomial_features: List[str]
+    # Rules describing feature ratio calculations
+    feature_ratios: List[FeatureEngineeringRule]
+    # Polynomial degree, defaults to 2 and is validated to lie in 1..3
+    polynomial_degree: int = Field(default=2, ge=1, le=3)
+
+# Endpoint to update feature engineering rules
+# Persists the submitted rules to config/feature_engineering_rules.json
+# (relative to the process working directory) and echoes them back.
+# NOTE(review): nothing in the visible part of this file reads that JSON file
+# back, so saved rules may not influence /predict - confirm.
+@app.put("/feature-engineering/rules", tags=["Feature Engineering"])
+async def update_feature_engineering_rules(rules: FeatureEngineeringRules):
+    """
+    Update feature engineering rules including:
+    - Health indicator calculations
+    - Polynomial feature generation rules
+    - Feature ratio calculations
+    """
+    try:
+        # Save the rules to a configuration file
+        rules_dict = rules.dict()
+        os.makedirs("config", exist_ok=True)
+        # Explicit encoding so the file does not depend on the platform default
+        with open("config/feature_engineering_rules.json", "w", encoding="utf-8") as f:
+            json.dump(rules_dict, f, indent=4)
+
+        return {
+            "status": "success",
+            "message": "Feature engineering rules updated successfully",
+            "rules": rules_dict
+        }
+    except Exception as e:
+        # Log the failure like the other endpoints do, then report it as a 500;
+        # the response detail keeps its original plain-string shape.
+        error_msg = f"Failed to update feature engineering rules: {str(e)}"
+        logger.error(error_msg)
+        raise HTTPException(
+            status_code=500,
+            detail=error_msg
+        ) from e
+
+@app.get("/docs", include_in_schema=False)
+async def custom_swagger_ui_html():
+    """
+    Serve a hand-written Swagger UI page at /docs.
+
+    The page loads swagger-ui-dist@5 (CSS and JS bundle) from the jsDelivr CDN,
+    points it at /openapi.json, hides the filter box and operation ids, and
+    colours each endpoint group through CSS rules plus a JavaScript pass that
+    is re-applied by a MutationObserver whenever the UI's DOM changes.
+
+    Returns an HTMLResponse; the route itself is excluded from the schema.
+
+    NOTE(review): presumably the app is created with docs_url=None so this
+    route is not shadowed by FastAPI's built-in /docs - confirm.
+    NOTE(review): the selectors assume element ids such as
+    '#operations-Models-get'; confirm they match the ids Swagger UI actually
+    generates for these operations.
+    """
+    # The whole page is one static string; nothing is interpolated into it.
+    html_content = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui.css">
+        <link rel="shortcut icon" href="/favicon.ico">
+        <title>Smoking Status Prediction API - Swagger UI</title>
+        <style>
+            /* Hide the filter input */
+            .swagger-ui .filter-container {
+                display: none !important;
+            }
+            
+            /* API Description and Tag Description Styling */
+            .title, .description, .opblock-tag-section h3 span, .opblock-tag-section .markdown p {
+                font-weight: bold !important;
+                font-size: calc(100% + 2pt) !important;
+            }
+            .info__title {
+                font-weight: bold !important;
+                font-size: calc(100% + 4pt) !important;
+            }
+
+            /* Method button styling */
+            .swagger-ui .opblock-summary-method {
+                min-width: 80px !important;
+                text-align: center !important;
+                border-radius: 3px !important;
+                padding: 6px 15px !important;
+            }
+
+            /* Root endpoint (Yellow) */
+            .swagger-ui #operations-Root-get .opblock-summary-method,
+            .swagger-ui #operations-Root-get .btn,
+            .swagger-ui #operations-Root-get .execute,
+            .swagger-ui #operations-Root-get .try-out__btn {
+                background: #FFD700 !important;
+                border-color: #FFD700 !important;
+                color: #000000 !important;
+            }
+            .swagger-ui #operations-Root-get.is-open .opblock-summary {
+                border-color: #FFD700 !important;
+            }
+
+            /* Models endpoint (Purple) */
+            .swagger-ui #operations-Models-get .opblock-summary-method,
+            .swagger-ui #operations-Models-get .btn,
+            .swagger-ui #operations-Models-get .execute,
+            .swagger-ui #operations-Models-get .try-out__btn {
+                background: #9B59B6 !important;
+                border-color: #9B59B6 !important;
+                color: #FFFFFF !important;
+            }
+            .swagger-ui #operations-Models-get.is-open .opblock-summary {
+                border-color: #9B59B6 !important;
+            }
+
+            /* Health endpoint (Green) */
+            .swagger-ui #operations-Health-get .opblock-summary-method,
+            .swagger-ui #operations-Health-get .btn,
+            .swagger-ui #operations-Health-get .execute,
+            .swagger-ui #operations-Health-get .try-out__btn {
+                background: #2ECC71 !important;
+                border-color: #2ECC71 !important;
+                color: #FFFFFF !important;
+            }
+            .swagger-ui #operations-Health-get.is-open .opblock-summary {
+                border-color: #2ECC71 !important;
+            }
+
+            /* Predictions endpoint (Orange) */
+            .swagger-ui #operations-Predictions-post .opblock-summary-method,
+            .swagger-ui #operations-Predictions-post .btn,
+            .swagger-ui #operations-Predictions-post .execute,
+            .swagger-ui #operations-Predictions-post .try-out__btn {
+                background: #E67E22 !important;
+                border-color: #E67E22 !important;
+                color: #FFFFFF !important;
+            }
+            .swagger-ui #operations-Predictions-post.is-open .opblock-summary {
+                border-color: #E67E22 !important;
+            }
+
+            /* Feature Engineering endpoint (Pink) */
+            .swagger-ui #operations-FeatureEngineering-put .opblock-summary-method,
+            .swagger-ui #operations-FeatureEngineering-put .btn,
+            .swagger-ui #operations-FeatureEngineering-put .execute,
+            .swagger-ui #operations-FeatureEngineering-put .try-out__btn {
+                background: #FF1493 !important;
+                border-color: #FF1493 !important;
+                color: #FFFFFF !important;
+            }
+            .swagger-ui #operations-FeatureEngineering-put.is-open .opblock-summary {
+                border-color: #FF1493 !important;
+            }
+
+            /* Hide operation IDs */
+            .swagger-ui .opblock-summary-operation-id {
+                display: none !important;
+            }
+
+            /* Button hover effects */
+            .swagger-ui #operations-Root-get .opblock-summary-method:hover,
+            .swagger-ui #operations-Root-get .btn:hover {
+                background: #FFE44D !important;
+            }
+            .swagger-ui #operations-Models-get .opblock-summary-method:hover,
+            .swagger-ui #operations-Models-get .btn:hover {
+                background: #A569BD !important;
+            }
+            .swagger-ui #operations-Health-get .opblock-summary-method:hover,
+            .swagger-ui #operations-Health-get .btn:hover {
+                background: #27AE60 !important;
+            }
+            .swagger-ui #operations-Predictions-post .opblock-summary-method:hover,
+            .swagger-ui #operations-Predictions-post .btn:hover {
+                background: #D35400 !important;
+            }
+            .swagger-ui #operations-FeatureEngineering-put .opblock-summary-method:hover,
+            .swagger-ui #operations-FeatureEngineering-put .btn:hover {
+                background: #FF69B4 !important;
+            }
+
+            /* Active button styles */
+            .swagger-ui .try-out__btn:active {
+                box-shadow: 0 0 5px rgba(0, 0, 0, 0.2) !important;
+            }
+        </style>
+    </head>
+    <body>
+        <div id="swagger-ui"></div>
+        <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui-bundle.js"></script>
+        <script>
+            window.onload = () => {
+                const ui = SwaggerUIBundle({
+                    url: '/openapi.json',
+                    dom_id: '#swagger-ui',
+                    deepLinking: true,
+                    displayRequestDuration: true,
+                    filter: false,
+                    operationsSorter: 'alpha',
+                    presets: [
+                        SwaggerUIBundle.presets.apis,
+                        SwaggerUIBundle.SwaggerUIStandalonePreset
+                    ],
+                    plugins: [
+                        SwaggerUIBundle.plugins.DownloadUrl
+                    ]
+                });
+
+                // Additional styling for buttons after UI loads
+                setTimeout(() => {
+                    const applyColors = () => {
+                        // Root endpoint (Yellow)
+                        const rootElements = document.querySelectorAll('#operations-Root-get button');
+                        rootElements.forEach(el => {
+                            el.style.setProperty('background', '#FFD700', 'important');
+                            el.style.setProperty('border-color', '#FFD700', 'important');
+                            el.style.setProperty('color', '#000000', 'important');
+                        });
+
+                        // Models endpoint (Purple)
+                        const modelsElements = document.querySelectorAll('#operations-Models-get button');
+                        modelsElements.forEach(el => {
+                            el.style.setProperty('background', '#9B59B6', 'important');
+                            el.style.setProperty('border-color', '#9B59B6', 'important');
+                            el.style.setProperty('color', '#FFFFFF', 'important');
+                        });
+
+                        // Health endpoint (Green)
+                        const healthElements = document.querySelectorAll('#operations-Health-get button');
+                        healthElements.forEach(el => {
+                            el.style.setProperty('background', '#2ECC71', 'important');
+                            el.style.setProperty('border-color', '#2ECC71', 'important');
+                            el.style.setProperty('color', '#FFFFFF', 'important');
+                        });
+
+                        // Predictions endpoint (Orange)
+                        const predictionElements = document.querySelectorAll('#operations-Predictions-post button');
+                        predictionElements.forEach(el => {
+                            el.style.setProperty('background', '#E67E22', 'important');
+                            el.style.setProperty('border-color', '#E67E22', 'important');
+                            el.style.setProperty('color', '#FFFFFF', 'important');
+                        });
+                        
+                        // Feature Engineering endpoint (Pink)
+                        const featureElements = document.querySelectorAll('#operations-FeatureEngineering-put button');
+                        featureElements.forEach(el => {
+                            el.style.setProperty('background', '#FF1493', 'important');
+                            el.style.setProperty('border-color', '#FF1493', 'important');
+                            el.style.setProperty('color', '#FFFFFF', 'important');
+                        });
+                    };
+
+                    // Apply colors initially
+                    applyColors();
+
+                    // Reapply colors when sections are expanded
+                    const observer = new MutationObserver(applyColors);
+                    observer.observe(document.getElementById('swagger-ui'), {
+                        childList: true,
+                        subtree: true
+                    });
+                }, 100);
+            };
+        </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=html_content)
+
+
+class ModelDeployment:
+    """
+    Feature-engineer a raw record and score it with a single deployed model.
+
+    The model is looked up once, at construction time, under the key
+    'smoking_status' in the module-level `models` mapping; `self.model` is
+    None when that key is absent.
+    """
+
+    def __init__(self):
+        self.feature_engineer = FeatureEngineer()
+        self.model = models.get('smoking_status', None)  # Assuming models is defined elsewhere
+
+    async def predict(self, data: Dict):
+        """
+        Make predictions using the deployed model
+
+        data: mapping of raw input fields for one record.
+
+        Returns a dict with the predicted class ("prediction"), the
+        probability taken from column 1 of predict_proba ("probability")
+        and "status": "success".
+
+        Raises HTTPException (500) when no model is loaded or when feature
+        engineering / scoring fails.
+        """
+        # Fail early with a clear message instead of letting a None model
+        # surface later as an opaque AttributeError.
+        if self.model is None:
+            error_msg = "Prediction failed: model 'smoking_status' is not loaded"
+            logger.error(error_msg)
+            raise HTTPException(status_code=500, detail=error_msg)
+
+        try:
+            # Convert input data to DataFrame
+            df = pd.DataFrame([data])
+
+            # Apply feature engineering
+            df = self.feature_engineer.transform(df)
+
+            # Make prediction
+            prediction = self.model.predict(df)[0]
+            probability = self.model.predict_proba(df)[0][1]
+
+            return {
+                "prediction": int(prediction),
+                "probability": float(probability),
+                "status": "success"
+            }
+        except Exception as e:
+            logger.error(f"Prediction failed: {str(e)}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Prediction failed: {str(e)}"
+            ) from e
+
+
+@app.patch("/models/{model_name}/parameters", tags=["Models"])
+async def update_model_parameters(
+    model_name: str = Path(
+        ...,
+        description="Model name to update parameters for"
+    ),
+    parameters: ModelParameters = Body(...)
+):
+    """
+    Update model parameters including:
+    - Confidence threshold for predictions
+    - Class weights for model predictions
+    - Health indicator thresholds
+    """
+    try:
+        # Only models that are currently loaded can be configured
+        if model_name not in models:
+            known_models = list(models.keys())
+            raise HTTPException(
+                status_code=404,
+                detail=f"Model '{model_name}' not found. Available models: {known_models}"
+            )
+
+        # First update for this model: start from a copy of the defaults
+        if model_name not in model_parameters:
+            model_parameters[model_name] = DEFAULT_MODEL_PARAMETERS.copy()
+        stored_params = model_parameters[model_name]
+
+        # Overlay just the fields the client actually sent
+        updated_params = parameters.dict(exclude_unset=True)
+        stored_params.update(updated_params)
+
+        logger.info(f"Updated parameters for model {model_name}: {updated_params}")
+
+        response = {
+            "status": "success",
+            "message": f"Parameters updated successfully for model: {model_name}",
+            "model": model_name,
+            "updated_parameters": updated_params,
+            "current_parameters": stored_params
+        }
+        return response
+    except HTTPException:
+        # The 404 above passes through untouched
+        raise
+    except Exception as e:
+        failure = f"Failed to update model parameters: {str(e)}"
+        logger.error(failure)
+        raise HTTPException(status_code=500, detail={"error": failure})
+
+if __name__ == "__main__":
+    # uvicorn only honours reload=True when the application is passed as an
+    # import string ("module:app"); given an app object it logs a warning and
+    # exits. Start without reload here and use the uvicorn CLI with --reload
+    # during development instead.
+    uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file
diff --git a/projects/prediction/Smoking Prediction/model_evaluation.py b/projects/prediction/Smoking Prediction/model_evaluation.py
new file mode 100644
index 000000000..129dee5c8
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/model_evaluation.py	
@@ -0,0 +1,213 @@
+
+#? STAGE 6: MODEL EVALUATION
+
+
+import os
+import json
+import joblib
+import numpy as np
+import pandas as pd
+# Set the backend to 'Agg' before importing matplotlib
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import (
+    accuracy_score, precision_score, recall_score, f1_score,
+    roc_auc_score, confusion_matrix, classification_report,
+    roc_curve, precision_recall_curve
+)
+
+def convert_to_python_types(d):
+    """
+    Convert NumPy types to native Python types for JSON serialization
+
+    Works recursively through dicts, lists and tuples. NumPy booleans,
+    integers and floats become bool, int and float; NumPy arrays and pandas
+    Series become lists; NumPy scalar dict keys are converted as well, since
+    json only accepts native key types. Any other value is returned unchanged.
+    """
+    if isinstance(d, dict):
+        return {
+            (k.item() if isinstance(k, np.generic) else k): convert_to_python_types(v)
+            for k, v in d.items()
+        }
+    # np.bool_ is neither np.integer nor np.floating and json cannot encode it
+    if isinstance(d, np.bool_):
+        return bool(d)
+    if isinstance(d, np.integer):  # Updated for NumPy 2.0+
+        return int(d)
+    if isinstance(d, np.floating):  # Updated for NumPy 2.0+
+        return float(d)
+    if isinstance(d, (np.ndarray, pd.Series)):
+        return convert_to_python_types(d.tolist())
+    if isinstance(d, list):
+        return [convert_to_python_types(i) for i in d]
+    if isinstance(d, tuple):
+        # Stay a tuple, but convert the elements so nested NumPy scalars serialize
+        return tuple(convert_to_python_types(i) for i in d)
+    return d
+
+def _summarize_errors(errors_df, error_type, features):
+    """Return count, mean probability and per-feature means for rows labelled error_type."""
+    subset = errors_df[errors_df['Error_Type'] == error_type]
+    return {
+        'count': int(len(subset)),
+        # Mean of an empty subset is NaN; it is written to the JSON as-is
+        'avg_probability': float(subset['Probability'].mean()),
+        'feature_means': convert_to_python_types(subset[features].mean().to_dict())
+    }
+
+def evaluate_model(model, X_test, y_test, model_name, features):
+    """
+    Evaluate model performance and generate visualizations
+
+    model: fitted classifier exposing predict and predict_proba.
+    X_test: DataFrame of test features (it is re-indexed with reset_index).
+    y_test: true binary labels, in the same row order as X_test.
+    model_name: label used in plot titles and output file names.
+    features: column names used for error analysis and feature importance.
+
+    Writes ROC, confusion-matrix and (when the model has
+    feature_importances_) feature-importance artifacts under
+    artifacts/visualizations/, plus an error-analysis JSON file.
+
+    Returns a dict with accuracy, precision, recall, f1 and roc_auc as floats.
+    """
+    # Make predictions
+    y_pred = model.predict(X_test)
+    y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+    # Calculate metrics
+    metrics = {
+        'accuracy': float(accuracy_score(y_test, y_pred)),
+        'precision': float(precision_score(y_test, y_pred)),
+        'recall': float(recall_score(y_test, y_pred)),
+        'f1': float(f1_score(y_test, y_pred)),
+        'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
+    }
+
+    # Create visualizations directory
+    os.makedirs('artifacts/visualizations', exist_ok=True)
+
+    # Plot ROC Curve
+    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
+    plt.figure(figsize=(10, 6))
+    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {metrics["roc_auc"]:.2f})')
+    plt.plot([0, 1], [0, 1], 'k--', label='Random')
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title(f'ROC Curve - {model_name}')
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f'artifacts/visualizations/roc_curve_{model_name}.png')
+    plt.close()
+
+    # Plot Confusion Matrix
+    cm = confusion_matrix(y_test, y_pred)
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.title(f'Confusion Matrix - {model_name}')
+    plt.savefig(f'artifacts/visualizations/confusion_matrix_{model_name}.png')
+    plt.close()
+
+    # Enhanced error analysis
+    # Build the frame from plain arrays so it gets a fresh 0..n-1 index. If
+    # y_test kept its own index, the column-wise concat with the re-indexed
+    # X_test below would pair labels with the wrong feature rows.
+    errors_df = pd.DataFrame({
+        'Actual': np.asarray(y_test),
+        'Predicted': np.asarray(y_pred),
+        'Probability': np.asarray(y_pred_proba)
+    })
+    errors_df['Error_Type'] = 'Correct'
+    errors_df.loc[(errors_df['Actual'] == 1) & (errors_df['Predicted'] == 0), 'Error_Type'] = 'False Negative'
+    errors_df.loc[(errors_df['Actual'] == 0) & (errors_df['Predicted'] == 1), 'Error_Type'] = 'False Positive'
+
+    # Add feature values for error analysis
+    errors_df = pd.concat([errors_df, X_test.reset_index(drop=True)], axis=1)
+
+    # Save error analysis with converted types
+    error_analysis = {
+        'false_positives': _summarize_errors(errors_df, 'False Positive', features),
+        'false_negatives': _summarize_errors(errors_df, 'False Negative', features)
+    }
+
+    # Save detailed error analysis
+    with open(f'artifacts/visualizations/error_analysis_{model_name}.json', 'w') as f:
+        json.dump(error_analysis, f, indent=4)
+
+    # Plot confusion matrix with percentages
+    plt.figure(figsize=(10, 8))
+    cm_percent = confusion_matrix(y_test, y_pred, normalize='true') * 100
+    sns.heatmap(cm_percent, annot=True, fmt='.1f', cmap='Blues',
+                xticklabels=['Non-Smoker', 'Smoker'],
+                yticklabels=['Non-Smoker', 'Smoker'])
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.title(f'Confusion Matrix (%) - {model_name}')
+    plt.savefig(f'artifacts/visualizations/confusion_matrix_percent_{model_name}.png')
+    plt.close()
+
+    # Feature Importance Plot (if available)
+    if hasattr(model, 'feature_importances_'):
+        importances = pd.DataFrame({
+            'feature': features,
+            'importance': [float(i) for i in model.feature_importances_]  # Convert to Python float
+        }).sort_values('importance', ascending=False)
+
+        # Plot top 20 features
+        plt.figure(figsize=(12, 8))
+        top_20_features = importances.head(20)
+        sns.barplot(data=top_20_features, x='importance', y='feature')
+        plt.title(f'Top 20 Feature Importance - {model_name}')
+        plt.xlabel('Importance')
+        plt.tight_layout()
+        plt.savefig(f'artifacts/visualizations/feature_importance_{model_name}.png')
+        plt.close()
+
+        # Save complete feature importance to JSON
+        importance_dict = {k: float(v) for k, v in importances.set_index('feature')['importance'].to_dict().items()}
+        with open(f'artifacts/visualizations/feature_importance_{model_name}.json', 'w') as f:
+            json.dump(importance_dict, f, indent=4)
+
+    return metrics
+
+def main():
+    """
+    Evaluate each dataset's saved model on its processed test split.
+
+    Reads artifacts/models/model_info.json, loads every listed model with
+    joblib, scores it through evaluate_model, prints the metrics and writes
+    the combined results to artifacts/models/evaluation_results.json.
+    """
+    # Load model information
+    with open('artifacts/models/model_info.json', 'r') as info_file:
+        model_info = json.load(info_file)
+
+    # Paths to test datasets
+    test_csv_by_dataset = {
+        'ml_olympiad': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv',
+        'archive': 'Y:/SmokingML V2/data/processed/archive_test.csv'
+    }
+
+    evaluation_results = {}
+
+    for dataset_name, test_path in test_csv_by_dataset.items():
+        print(f"\nEvaluating model for {dataset_name} dataset...")
+        entry = model_info[dataset_name]
+
+        # Load the model, then the matching test split
+        model = joblib.load(entry['model_path'])
+        test_df = pd.read_csv(test_path)
+
+        # Dataset-specific feature columns come from the model info file
+        features = entry['features']
+        X_test = test_df[features]
+        y_test = test_df['smoking']
+        feature_count = len(features)
+
+        print(f"Number of features being used for {dataset_name}: {feature_count}")
+
+        # Evaluate model
+        run_label = f"{dataset_name}_{entry['name']}"
+        metrics = evaluate_model(model, X_test, y_test, run_label, features)
+
+        # Store results
+        evaluation_results[dataset_name] = {
+            'model_name': entry['name'],
+            'metrics': metrics,
+            'num_features': feature_count,
+            'features': features
+        }
+
+        print(f"\nResults for {dataset_name}:")
+        print(f"Model: {entry['name']}")
+        print(f"Number of features: {feature_count}")
+        for metric_name, metric_value in metrics.items():
+            print(f"{metric_name}: {metric_value:.4f}")
+
+    # Save evaluation results
+    with open('artifacts/models/evaluation_results.json', 'w') as results_file:
+        json.dump(evaluation_results, results_file, indent=4)
+
+    print("\nEvaluation completed! Results saved to artifacts/models/evaluation_results.json")
+    print("Visualizations saved to artifacts/visualizations/")
+
+# Script entry point: run the evaluation only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/projects/prediction/Smoking Prediction/model_improvements.py b/projects/prediction/Smoking Prediction/model_improvements.py
new file mode 100644
index 000000000..5de77ff0e
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/model_improvements.py	
@@ -0,0 +1,309 @@
+
+#? STAGE 7: MODEL IMPROVEMENTS
+
+import os
+import json
+import joblib
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')  # Set non-interactive backend before other matplotlib imports
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.preprocessing import StandardScaler, PolynomialFeatures
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
+                           roc_auc_score, confusion_matrix, precision_recall_curve, 
+                           roc_curve, auc)
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+from sklearn.feature_selection import SelectFromModel
+from xgboost import XGBClassifier
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+
+def custom_scorer(y_true, y_pred):
+    """Custom scorer that emphasizes precision while maintaining other metrics"""
+    precision = precision_score(y_true, y_pred)
+    recall = recall_score(y_true, y_pred)
+    f1 = f1_score(y_true, y_pred)
+    # Weight precision more heavily
+    return (2 * precision + recall + f1) / 4
+
+def create_advanced_features(df):
+    """Create advanced feature set with enhanced interactions"""
+    # Original features except 'smoking'
+    original_features = [col for col in df.columns if col != 'smoking']
+    
+    # Enhanced health indicators
+    df['bmi'] = df['weight(kg)'] / ((df['height(cm)']/100) ** 2)
+    df['liver_function'] = (df['AST'] + df['ALT'] + df['Gtp']) / 3
+    df['cardiovascular_risk'] = (df['systolic'] * df['triglyceride']) / (df['HDL'] + 1)
+    df['metabolic_index'] = df['fasting blood sugar'] * df['bmi'] / (df['HDL'] + 1)
+    df['age_health_index'] = df['age'] * df['hemoglobin'] / df['liver_function']
+    
+    # Polynomial features for key health indicators
+    poly = PolynomialFeatures(degree=2, include_bias=False)
+    key_features = ['bmi', 'liver_function', 'cardiovascular_risk', 'metabolic_index']
+    poly_features = poly.fit_transform(df[key_features])
+    poly_names = [f'health_poly_{i}' for i in range(poly_features.shape[1])]
+    df[poly_names] = poly_features
+    
+    # Feature ratios
+    df['hdl_ldl_ratio'] = df['HDL'] / (df['LDL'] + 1)
+    df['ast_alt_ratio'] = df['AST'] / (df['ALT'] + 1)
+    df['bp_ratio'] = df['systolic'] / (df['relaxation'] + 1)
+    
+    # All features
+    all_features = (
+        original_features + 
+        poly_names + 
+        ['bmi', 'liver_function', 'cardiovascular_risk', 'metabolic_index', 
+         'age_health_index', 'hdl_ldl_ratio', 'ast_alt_ratio', 'bp_ratio']
+    )
+    
+    # Normalize features
+    scaler = StandardScaler()
+    df[all_features] = scaler.fit_transform(df[all_features])
+    
+    return df[all_features]
+
+def create_efficient_ensemble(dataset_name):
+    """Create an enhanced voting ensemble with XGBoost and Random Forest"""
+    if dataset_name == 'archive':
+        rf = RandomForestClassifier(
+            n_estimators=1200,
+            max_depth=10,
+            min_samples_split=8,
+            min_samples_leaf=6,
+            max_features=0.7,
+            min_impurity_decrease=0.004,
+            class_weight={0: 1.2, 1: 1},  # Reduced class weight difference
+            criterion='entropy',
+            random_state=42,
+            n_jobs=-1,
+            bootstrap=True,
+            oob_score=True,
+            max_samples=0.85
+        )
+        
+        xgb = XGBClassifier(
+            max_depth=7,
+            learning_rate=0.03,
+            n_estimators=400,
+            min_child_weight=3,
+            gamma=0.15,
+            subsample=0.85,
+            colsample_bytree=0.85,
+            scale_pos_weight=1.2,  # Reduced scale weight
+            random_state=42,
+            n_jobs=-1
+        )
+        
+        return VotingClassifier(
+            estimators=[
+                ('rf', rf),
+                ('xgb', xgb)
+            ],
+            voting='soft',
+            weights=[0.5, 0.5]  # Equal weights for better balance
+        )
+    else:
+        # Keep existing XGBoost for ml_olympiad
+        return XGBClassifier(
+            max_depth=5,
+            learning_rate=0.1,
+            n_estimators=100,
+            min_child_weight=3,
+            gamma=0.1,
+            subsample=0.9,
+            colsample_bytree=0.8,
+            random_state=42,
+            n_jobs=-1,
+            tree_method='hist',
+            eval_metric='logloss',
+            enable_categorical=False
+        )
+
+def select_best_features(X, y, threshold=0.55):  # Adjusted threshold
+    """Select best features using enhanced selection"""
+    selector = SelectFromModel(
+        estimator=XGBClassifier(
+            n_estimators=250,
+            max_depth=6,
+            learning_rate=0.02,
+            subsample=0.85,
+            colsample_bytree=0.85,
+            min_child_weight=4,
+            random_state=42,
+            n_jobs=-1
+        ),
+        threshold=threshold
+    )
+    selector.fit(X, y)
+    return selector
+
+def create_visualizations(y_true, y_pred, y_pred_proba, dataset_name, model_name):
+    """Create and save visualization plots for model evaluation"""
+    os.makedirs('artifacts/visualizations', exist_ok=True)
+    
+    # Confusion Matrix
+    plt.figure(figsize=(10, 8))
+    cm = confusion_matrix(y_true, y_pred)
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+    plt.title(f'Confusion Matrix - {dataset_name} ({model_name})')
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.savefig(f'artifacts/visualizations/confusion_matrix_{dataset_name}_{model_name}.png')
+    plt.close()
+
+    # Percentage Confusion Matrix
+    plt.figure(figsize=(10, 8))
+    cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
+    sns.heatmap(cm_percent, annot=True, fmt='.1f', cmap='Blues')
+    plt.title(f'Confusion Matrix (%) - {dataset_name} ({model_name})')
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.savefig(f'artifacts/visualizations/confusion_matrix_percent_{dataset_name}_{model_name}.png')
+    plt.close()
+
+    # ROC Curve
+    plt.figure(figsize=(10, 8))
+    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+    roc_auc = auc(fpr, tpr)
+    plt.plot(fpr, tpr, color='darkorange', lw=2, 
+             label=f'ROC curve (AUC = {roc_auc:.2f})')
+    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
+    plt.xlim([0.0, 1.0])
+    plt.ylim([0.0, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title(f'ROC Curve - {dataset_name} ({model_name})')
+    plt.legend(loc="lower right")
+    plt.savefig(f'artifacts/visualizations/roc_curve_{dataset_name}_{model_name}.png')
+    plt.close()
+
+    # Precision-Recall Curve
+    plt.figure(figsize=(10, 8))
+    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
+    pr_auc = auc(recall, precision)
+    plt.plot(recall, precision, color='darkorange', lw=2,
+             label=f'PR curve (AUC = {pr_auc:.2f})')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title(f'Precision-Recall Curve - {dataset_name} ({model_name})')
+    plt.legend(loc="lower right")
+    plt.savefig(f'artifacts/visualizations/pr_curve_{dataset_name}_{model_name}.png')
+    plt.close()
+
+def main():
+    """Main function to train and evaluate improved models"""
+    dataset_paths = {
+        'ml_olympiad': {
+            'train': 'Y:/SmokingML V2/data/processed/ml_olympiad_train.csv',
+            'test': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv'
+        },
+        'archive': {
+            'train': 'Y:/SmokingML V2/data/processed/archive_train.csv',
+            'test': 'Y:/SmokingML V2/data/processed/archive_test.csv'
+        }
+    }
+
+    print("Using dataset paths:")
+    for dataset, paths in dataset_paths.items():
+        print(f"{dataset}:")
+        print(f"  Train: {paths['train']}")
+        print(f"  Test:  {paths['test']}")
+
+    results = {}
+    
+    for dataset_name, paths in dataset_paths.items():
+        print(f"\nImproving model for {dataset_name} dataset...")
+        
+        print("Loading data...")
+        # Load data
+        train_df = pd.read_csv(paths['train'])
+        test_df = pd.read_csv(paths['test'])
+        print(f"Loaded training data shape: {train_df.shape}")
+        print(f"Loaded test data shape: {test_df.shape}")
+        
+        print("Creating advanced features...")
+        # Create advanced features
+        X_train = create_advanced_features(train_df)
+        y_train = train_df['smoking']
+        X_test = create_advanced_features(test_df)
+        y_test = test_df['smoking']
+        print(f"Features created. Training features shape: {X_train.shape}")
+        
+        print("Selecting best features...")
+        # Feature selection
+        selector = select_best_features(X_train, y_train, threshold='median')
+        X_train_selected = selector.transform(X_train)
+        X_test_selected = selector.transform(X_test)
+        print(f"Selected {X_train_selected.shape[1]} features")
+        
+        # Apply modified SMOTE for archive dataset
+        if dataset_name == 'archive':
+            print("Applying SMOTE resampling...")
+            smote = SMOTE(
+                random_state=42,
+                k_neighbors=5,
+                sampling_strategy=0.85
+            )
+            X_train_selected, y_train = smote.fit_resample(X_train_selected, y_train)
+            print(f"After SMOTE - Training data shape: {X_train_selected.shape}")
+        
+        print("Training model...")
+        # Create and train model
+        model = create_efficient_ensemble(dataset_name)
+        model.fit(X_train_selected, y_train)
+        
+        print("Making predictions...")
+        # Get predictions
+        y_pred = model.predict(X_test_selected)
+        y_pred_proba = model.predict_proba(X_test_selected)[:, 1]
+        
+        print("Calculating metrics...")
+        # Calculate metrics
+        metrics = {
+            'accuracy': float(accuracy_score(y_test, y_pred)),
+            'precision': float(precision_score(y_test, y_pred)),
+            'recall': float(recall_score(y_test, y_pred)),
+            'f1': float(f1_score(y_test, y_pred)),
+            'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
+        }
+        
+        print("Creating visualizations...")
+        # Create visualizations
+        model_name = 'Ensemble' if dataset_name == 'archive' else 'XGBoost'
+        create_visualizations(y_test, y_pred, y_pred_proba, dataset_name, model_name)
+        
+        # Save results
+        results[dataset_name] = {
+            'metrics': metrics,
+            'n_features_selected': int(X_train_selected.shape[1]),
+            'features': list(X_train.columns[selector.get_support()])
+        }
+        
+        # Save model and feature selector
+        model_artifacts = {
+            'model': model,
+            'selector': selector,
+            'feature_names': list(X_train.columns)
+        }
+        model_path = f"models/{dataset_name}_improved_final.pkl"
+        joblib.dump(model_artifacts, model_path)
+        print(f"Saved improved model to {model_path}")
+        
+        print(f"\nFinal Results for {dataset_name}:")
+        for metric, value in metrics.items():
+            print(f"{metric}: {value:.4f}")
+    
+    # Save results
+    os.makedirs('artifacts/improvements', exist_ok=True)
+    with open('artifacts/improvements/final_results.json', 'w') as f:
+        json.dump(results, f, indent=4)
+    
+    print("\nFinal improvements completed! Results saved to artifacts/improvements/final_results.json")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/projects/prediction/Smoking Prediction/model_optimization.py b/projects/prediction/Smoking Prediction/model_optimization.py
new file mode 100644
index 000000000..aecf654d9
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/model_optimization.py	
@@ -0,0 +1,211 @@
+
+#? STAGE 5: MODEL OPTIMIZATION
+
+
+import os
+import json
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import RandomizedSearchCV, cross_val_score
+import scipy.stats as stats
+from sklearn.ensemble import RandomForestClassifier
+from xgboost import XGBClassifier
+from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
+
+def create_balanced_scorer(dataset_name):
+    """Create a custom scorer that ensures all metrics improve for the specific dataset"""
+    def balanced_scorer(y_true, y_pred):
+        # Calculate all metrics
+        acc = accuracy_score(y_true, y_pred)
+        prec = precision_score(y_true, y_pred)
+        rec = recall_score(y_true, y_pred)
+        f1 = f1_score(y_true, y_pred)
+        
+        # Select thresholds based on dataset
+        if dataset_name == 'ml_olympiad':
+            thresholds = {
+                'accuracy': 0.7777,
+                'precision': 0.7203,
+                'recall': 0.7981,
+                'f1': 0.7572
+            }
+        else:  # archive dataset
+            thresholds = {
+                'accuracy': 0.7724,
+                'precision': 0.6957,
+                'recall': 0.6768,
+                'f1': 0.6861
+            }
+        
+        # Calculate improvements relative to thresholds
+        acc_imp = (acc - thresholds['accuracy'])
+        prec_imp = (prec - thresholds['precision'])
+        rec_imp = (rec - thresholds['recall'])
+        f1_imp = (f1 - thresholds['f1'])
+        
+        # If any metric is below threshold, heavily penalize
+        if acc < thresholds['accuracy'] or prec < thresholds['precision'] or \
+           rec < thresholds['recall'] or f1 < thresholds['f1']:
+            return -100.0  # Strong penalty for any decrease
+        
+        # Otherwise, reward based on minimum improvement
+        min_improvement = min(acc_imp, prec_imp, rec_imp, f1_imp)
+        avg_improvement = (acc_imp + prec_imp + rec_imp + f1_imp) / 4
+        
+        # Combine minimum and average improvements
+        # This ensures we prioritize solutions where all metrics improve
+        return min_improvement + avg_improvement
+    
+    return make_scorer(balanced_scorer, greater_is_better=True)
+
+def optimize_model(X, y, model_type, param_grid, dataset_name):
+    """Optimize model hyperparameters using GridSearchCV with strict improvement requirements"""
+    # Create dataset-specific balanced scorer
+    balanced_scorer = create_balanced_scorer(dataset_name)
+    
+    # Perform grid search with reduced CV to speed up search
+    search = RandomizedSearchCV(
+        estimator=model_type,
+        param_distributions=param_grid,
+        n_iter=30,
+        scoring=balanced_scorer,
+        cv=3,
+        verbose=2,
+        random_state=42,
+        n_jobs=-1,
+        error_score='raise'
+    )
+    
+    search.fit(X, y)
+    
+    # Get predictions using best model
+    y_pred = search.best_estimator_.predict(X)
+    
+    # Calculate metrics
+    metrics = {
+        'accuracy': accuracy_score(y, y_pred),
+        'precision': precision_score(y, y_pred),
+        'recall': recall_score(y, y_pred),
+        'f1': f1_score(y, y_pred)
+    }
+    
+    return search.best_estimator_,{
+        'best_params': search.best_params_,
+        'best_score': float(search.best_score_),
+        'cv_results': metrics
+    }
+    
+
+def main():
+    """Main function to optimize models"""
+    # Load data
+    dataset_paths = {
+        'ml_olympiad': {
+            'train': 'Y:/SmokingML V2/data/processed/ml_olympiad_train.csv',
+            'test': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv'
+        },
+        'archive': {
+            'train': 'Y:/SmokingML V2/data/processed/archive_train.csv',
+            'test': 'Y:/SmokingML V2/data/processed/archive_test.csv'
+        }
+    }
+
+    # Load model information
+    with open('artifacts/models/model_info.json', 'r') as f:
+        model_info = json.load(f)
+
+    # Updated parameter grids with more focused ranges
+    param_grids = {
+        'XGBoost': {
+            'max_depth': [5, 6, 7],
+            'learning_rate': stats.uniform(0.01, 0.1),
+            'n_estimators': [300, 400, 500],
+            'min_child_weight': [2, 3, 4],
+            'gamma': stats.uniform(0, 0.3),
+            'subsample': stats.uniform(0.7, 0.3),
+            'colsample_bytree': stats.uniform(0.7, 0.3),
+            'scale_pos_weight': [1.0, 1.1],
+            'reg_alpha': stats.uniform(0.0, 0.3),
+            'reg_lambda': stats.uniform(1.0, 2.0),
+        },
+        'Random_Forest': {
+            'n_estimators': [200, 300],  # Reduced from 3 values to 2
+            'max_depth': [15, 20],       # Still allows deep trees but not overly large
+            'min_samples_split': [4],    # Fixed to one optimal value
+            'min_samples_leaf': [2],     # Fixed to one optimal value
+            'max_features': ['sqrt'],    # Typically best for classification
+            'class_weight': ['balanced'] # Good for imbalance handling
+        }
+    }
+
+    optimization_results = {}
+
+    for dataset_name, paths in dataset_paths.items():
+        print(f"\nOptimizing model for {dataset_name} dataset...")
+        
+        # Load training data
+        train_df = pd.read_csv(paths['train'])
+        test_df = pd.read_csv(paths['test'])
+        
+        # Get features from model info
+        features = model_info[dataset_name]['features']
+        
+        # Prepare data
+        X_train = train_df[features]
+        y_train = train_df['smoking']
+        X_test = test_df[features]
+        y_test = test_df['smoking']
+        
+        # Select model type and parameter grid
+        model_name = model_info[dataset_name]['name']
+        # Using XGBoost for both datasets
+        model_type = XGBClassifier(
+            random_state=42,
+            eval_metric='logloss',
+            enable_categorical=False
+        )
+        param_grid = param_grids['XGBoost']  # Use same parameter grid for both datasets
+
+        
+        # Optimize model
+        print(f"Performing grid search with cross-validation for {model_name}...")
+        best_model, cv_results = optimize_model(X_train, y_train, model_type, param_grid, dataset_name)
+        
+        # Evaluate on test set
+        y_pred = best_model.predict(X_test)
+        test_metrics = {
+            'accuracy': accuracy_score(y_test, y_pred),
+            'precision': precision_score(y_test, y_pred),
+            'recall': recall_score(y_test, y_pred),
+            'f1': f1_score(y_test, y_pred)
+        }
+        
+        # Save optimized model
+        model_path = f"models/{dataset_name}_{model_name}_optimized.pkl"
+        joblib.dump(best_model, model_path)
+        
+        # Store results
+        optimization_results[dataset_name] = {
+            'model_name': model_name,
+            'best_params': cv_results['best_params'],
+            'cv_scores': cv_results['cv_results'],
+            'test_scores': test_metrics,
+            'model_path': model_path
+        }
+        
+        print(f"\nOptimization results for {dataset_name}:")
+        print(f"Best parameters: {cv_results['best_params']}")
+        print("\nTest set scores:")
+        for metric, score in test_metrics.items():
+            print(f"{metric}: {score:.4f}")
+
+    # Save optimization results
+    os.makedirs('artifacts/optimization', exist_ok=True)
+    with open('artifacts/optimization/optimization_results.json', 'w') as f:
+        json.dump(optimization_results, f, indent=4)
+    
+    print("\nOptimization completed! Results saved to artifacts/optimization/optimization_results.json")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/projects/prediction/Smoking Prediction/model_training.py b/projects/prediction/Smoking Prediction/model_training.py
new file mode 100644
index 000000000..b864754af
--- /dev/null
+++ b/projects/prediction/Smoking Prediction/model_training.py	
@@ -0,0 +1,112 @@
+
+#? STAGE 4: MODEL TRAINING
+
+import os
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from xgboost import XGBClassifier
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+
+# Paths to datasets
+dataset_paths = {
+    'ml_olympiad': {
+        'train': 'Y:/SmokingML V2/data/processed/ml_olympiad_train.csv',
+        'test': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv'
+    },
+    'archive': {
+        'train': 'Y:/SmokingML V2/data/processed/archive_train.csv',
+        'test': 'Y:/SmokingML V2/data/processed/archive_test.csv'
+    }
+}
+
+# Directory to save models
+os.makedirs("models", exist_ok=True)
+
+# Best performing models for each dataset
+models = {
+    'ml_olympiad': {
+        'name': 'XGBoost',
+        'model': XGBClassifier(
+            random_state=42,
+            eval_metric='logloss',
+            enable_categorical=False,  # Modern replacement for use_label_encoder
+            verbosity=1
+        )
+    },
+    'archive': {
+        'name': 'XGBoost',
+        'model': XGBClassifier(
+            random_state=42,
+            eval_metric='logloss',
+            enable_categorical=False,  # Modern replacement for use_label_encoder
+            verbosity=1
+        )
+    }
+}
+
+# Dictionary to store trained models and their metrics
+model_info = {}
+
+for dataset_name, paths in dataset_paths.items():
+    print(f"\nProcessing {dataset_name} dataset...")
+    
+    # Load datasets
+    train_df = pd.read_csv(paths['train'])
+    test_df = pd.read_csv(paths['test'])
+    
+    # Get all features except 'smoking' (target)
+    features = [col for col in train_df.columns if col != 'smoking']
+    
+    # Split into features and target
+    x_train = train_df[features]
+    y_train = train_df['smoking']
+    x_val = test_df[features]
+    y_val = test_df['smoking']
+
+    print(f"\n======= Dataset Info: {dataset_name.replace('_', ' ').title()} =======")
+    print(f"Training data shape: {x_train.shape}")
+    print(f"Number of features: {len(features)}")
+    print("Features:", features)
+    
+    # Store model info
+    model_info[dataset_name] = {
+        'name': models[dataset_name]['name'],
+        'features': features  # Store all features for this dataset
+    }
+    
+    # Get and train the appropriate model
+    model = models[dataset_name]['model']
+    print(f"\nTraining {models[dataset_name]['name']} on {dataset_name} dataset...")
+    model.fit(x_train, y_train)
+
+    # Save model
+    model_filename = f"models/{dataset_name}_{models[dataset_name]['name']}.pkl"
+    joblib.dump(model, model_filename)
+    print(f"Saved {models[dataset_name]['name']} model at {model_filename}")
+
+    # Evaluate model
+    y_pred = model.predict(x_val)
+    accuracy = accuracy_score(y_val, y_pred)
+    print(f"\n{models[dataset_name]['name']} Accuracy on {dataset_name}: {accuracy:.4f}")
+
+    # Store metrics in model_info
+    model_info[dataset_name].update({
+        'accuracy': accuracy,
+        'model_path': model_filename
+    })
+
+    # Extra Evaluation Metrics
+    print(f"\nConfusion Matrix ({models[dataset_name]['name']} - {dataset_name}):")
+    print(confusion_matrix(y_val, y_pred))
+
+    print(f"\nClassification Report ({models[dataset_name]['name']} - {dataset_name}):")
+    print(classification_report(y_val, y_pred))
+
+# Save model information for use in evaluation and API
+model_info_path = "artifacts/models/model_info.json"
+os.makedirs(os.path.dirname(model_info_path), exist_ok=True)
+with open(model_info_path, 'w') as f:
+    import json
+    json.dump(model_info, f, indent=4)
diff --git a/projects/prediction/SmokingPredictionModel b/projects/prediction/SmokingPredictionModel
deleted file mode 160000
index 5f8875eb0..000000000
--- a/projects/prediction/SmokingPredictionModel
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 5f8875eb0244aca5d1d0d3b43ba54d20e61ad396