class DataIngestion:
    """Load raw CSV datasets and split each one into train/test partitions."""

    def __init__(self,dataset_paths):
        """
        Remember where each dataset lives on disk.

        dataset_paths: dictionary keyed by dataset name; each value is a
        dictionary of file paths. Only the "train" entry is read by load_data.
        Example:
        {
            "dataset1": {"train": "path/to/dataset1_train.csv"},
            "dataset2": {"train": "path/to/dataset2_train.csv"}
        }
        """
        self.dataset_paths = dataset_paths

    def load_data(self):
        """
        Read every configured "train" CSV and hold out part of it as a test set.

        Returns a dictionary keyed by dataset name whose values have the form
        {"train": DataFrame, "test": DataFrame}. The split uses test_size=0.2
        and random_state=42, so it is reproducible between runs.
        """
        loaded = {}
        for name in self.dataset_paths:
            # The "test" partition is carved out of the training file itself.
            source_frame = pd.read_csv(self.dataset_paths[name]["train"])
            fit_part, holdout_part = train_test_split(
                source_frame, test_size=0.2, random_state=42
            )
            loaded[name] = dict(train=fit_part, test=holdout_part)
        return loaded
#* Define Preprocessing Function
def preprocess_data(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """
    Impute, one-hot encode and scale the features, then split off a validation set.

    Both frames must contain a 'smoking' column (it is indexed unconditionally).

    Returns a 5-tuple ``(x_train, x_val, y_train, y_val, selected_features)``
    where the train/validation parts come from an 80/20 split of ``train_df``
    (random_state=42) and ``selected_features`` is the list of column names
    of ``x_train``.

    NOTE(review): ``test_df`` is transformed alongside the training data, but
    neither its features nor its target are returned.
    NOTE(review): the imputer and scaler are fit on the full training frame
    before the train/validation split, so validation rows contribute to the
    fitted statistics.
    """
    # Store target variable
    train_target = train_df['smoking']
    test_target = test_df['smoking']  # read but not used further in this function

    # Remove target from features
    train_features = train_df.drop('smoking', axis=1)
    test_features = test_df.drop('smoking', axis=1)

    # Get numeric columns excluding target (only int64/float64 dtypes are selected)
    num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Handle missing values for numeric columns: means are learned on train, reused on test
    imputer = SimpleImputer(strategy='mean')
    train_features[num_cols] = imputer.fit_transform(train_features[num_cols])
    test_features[num_cols] = imputer.transform(test_features[num_cols])

    # Handle categorical values (object-dtype columns)
    cat_cols = train_features.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # Encode categorical columns
    if len(cat_cols) > 0:
        train_encoded = pd.DataFrame(
            encoder.fit_transform(train_features[cat_cols]),
            index=train_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )
        test_encoded = pd.DataFrame(
            encoder.transform(test_features[cat_cols]),
            index=test_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )

        # Drop original categorical columns (the index is kept so the concat below aligns)
        train_features = train_features.drop(cat_cols, axis=1)
        test_features = test_features.drop(cat_cols, axis=1)

        # Concatenate encoded features after the remaining columns
        train_features = pd.concat([train_features, train_encoded], axis=1)
        test_features = pd.concat([test_features, test_encoded], axis=1)

    # Feature Scaling - only scale numeric columns (one-hot columns are left as-is)
    scaler = StandardScaler()
    train_features[num_cols] = scaler.fit_transform(train_features[num_cols])
    test_features[num_cols] = scaler.transform(test_features[num_cols])

    # Split features and target
    X = train_features
    y = train_target

    # Split training data into train and validation sets
    x_train, x_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

    # Store selected features (all columns at this stage; no selection has happened yet)
    selected_features = x_train.columns.tolist()

    # Return all 5 expected values
    return x_train, x_val, y_train, y_val, selected_features
def remove_highly_correlated_features(train_df, test_df, threshold=0.9, target_column=None):
    """
    Drop features that are near-duplicates of an earlier feature.

    For every pair of numeric columns in ``train_df`` the absolute Pearson
    correlation is computed; whenever it exceeds ``threshold`` the later
    column of the pair (in column order) is dropped from both frames.

    Args:
        train_df: frame the correlations are computed on.
        test_df: frame that receives the same column removals.
        threshold: absolute-correlation cut-off.
        target_column: optional label column to keep out of the correlation
            analysis so it can never be dropped. Defaults to None, in which
            case every numeric column takes part (the original behaviour).

    Returns:
        ``(train_df, test_df)`` without the dropped columns.
    """
    features = train_df
    if target_column is not None and target_column in train_df.columns:
        features = train_df.drop(columns=[target_column])

    # numeric_only avoids a failure on object columns; abs() makes strongly
    # negative correlations count as redundancy as well.
    correlation_matrix = features.corr(numeric_only=True).abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the first column of a correlated pair survives.
    upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper_triangle = correlation_matrix.where(upper_mask)

    drop_cols = [
        column
        for column in upper_triangle.columns
        if (upper_triangle[column] > threshold).any()
    ]

    # errors="ignore": the test frame may legitimately lack some columns.
    return (
        train_df.drop(columns=drop_cols),
        test_df.drop(columns=drop_cols, errors="ignore"),
    )


def select_features_by_mutual_info(train_df, test_df, target_column, num_features=15):
    """
    Keep the ``num_features`` columns with the highest mutual information.

    Scores are computed with ``mutual_info_classif`` on ``train_df`` against
    ``target_column``. The returned training frame always ends with the
    target column; the test frame includes it only when it is present there.
    """
    X = train_df.drop(columns=[target_column])
    y = train_df[target_column]

    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    feature_scores = pd.Series(mutual_info, index=X.columns)
    selected_features = feature_scores.nlargest(num_features).index.to_list()

    train_columns = selected_features + [target_column]
    test_columns = train_columns if target_column in test_df.columns else selected_features
    return train_df[train_columns], test_df[test_columns]


def apply_pca(train_df, test_df, n_components=10):
    """
    Project both frames onto ``n_components`` principal components.

    The PCA is fit on every column of ``train_df`` as passed in, so callers
    should remove any label column first. The returned frames are built from
    the raw arrays and therefore carry default integer column names and a
    fresh default index.
    """
    pca = PCA(n_components=n_components)
    train_pca = pca.fit_transform(train_df)
    test_pca = pca.transform(test_df)
    return pd.DataFrame(train_pca), pd.DataFrame(test_pca)
Shape:", train_ml.shape) + print("ML Olympiad Test Data Shape:", test_ml.shape) + print(train_ml.head()) + print("Archive Train Data Shape:", train_archive.shape) + print("Archive Test Data Shape:", test_archive.shape) + print(test_archive.head()) + + #* Apply Preprocessing to all datasets + x_train_ml, x_val_ml, y_train_ml, y_val_ml, selected_features_ml = preprocess_data(train_ml, test_ml) + x_train_archive, x_val_archive, y_train_archive, y_val_archive, selected_features_archive = preprocess_data(train_archive, test_archive) + + preprocessed_data_paths = { + "ml-olympiad-smoking": { + "train": "Y:/SmokingML V2/data/processed/ml_olympiad_train.csv", + "test": "Y:/SmokingML V2/data/processed/ml_olympiad_test.csv" + }, + "archive": { + "train": "Y:/SmokingML V2/data/processed/archive_train.csv", + "test": "Y:/SmokingML V2/data/processed/archive_test.csv" + } + } + + for dataset_name, paths in preprocessed_data_paths.items(): + for key, path in paths.items(): + os.makedirs(os.path.dirname(path), exist_ok=True) + + pd.concat([x_train_ml, y_train_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["train"], index=False) + pd.concat([x_val_ml, y_val_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["test"], index=False) + pd.concat([x_train_archive, y_train_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["train"], index=False) + pd.concat([x_val_archive, y_val_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["test"], index=False) + + print("Preprocessed data has been saved successfully!") + + #* Variance Thresholding + preprocessed_train_ml, preprocessed_test_ml = remove_low_variance_features(pd.concat([x_train_ml, y_train_ml], axis=1), pd.concat([x_val_ml, y_val_ml], axis=1)) + preprocessed_train_archive, preprocessed_test_archive = remove_low_variance_features(pd.concat([x_train_archive, y_train_archive], axis=1), pd.concat([x_val_archive, y_val_archive], axis=1)) + + #* Feature Selection + preprocessed_train_ml, 
preprocessed_test_ml = select_features_by_mutual_info(preprocessed_train_ml, preprocessed_test_ml, target_column='smoking') + preprocessed_train_archive, preprocessed_test_archive = select_features_by_mutual_info(preprocessed_train_archive, preprocessed_test_archive, target_column='smoking') + + #* ✅ Optional assertion checks + assert 'smoking' in preprocessed_train_ml.columns, "Target column 'smoking' missing in training set!" + assert 'smoking' in preprocessed_test_ml.columns, "Target column 'smoking' missing in test set!" + assert 'smoking' in preprocessed_train_archive.columns, "Target column 'smoking' missing in archive training set!" + assert 'smoking' in preprocessed_test_archive.columns, "Target column 'smoking' missing in archive test set!" + + #* ✅ Debug: Show absolute save paths + print("\n✅ Saving preprocessed files to:") + print("ML Train Path :", os.path.abspath("Y:/SmokingML V2/data/processed/train_ml.csv")) + print("ML Test Path :", os.path.abspath("Y:/SmokingML V2/data/processed/test_ml.csv")) + print("Archive Train Path :", os.path.abspath("Y:/SmokingML V2/data/processed/train_archive.csv")) + print("Archive Test Path :", os.path.abspath("Y:/SmokingML V2/data/processed/test_archive.csv")) + + #* Save final preprocessed files + preprocessed_train_ml.to_csv("Y:/SmokingML V2/data/processed/train_ml.csv", index=False) + preprocessed_test_ml.to_csv("Y:/SmokingML V2/data/processed/test_ml.csv", index=False) + preprocessed_train_archive.to_csv("Y:/SmokingML V2/data/processed/train_archive.csv", index=False) + preprocessed_test_archive.to_csv("Y:/SmokingML V2/data/processed/test_archive.csv", index=False) + + print("Feature Engineering and Selection completed Successfully!") + + + import json + + #* Save selected features to JSON for both datasets + selected_features_dir = "Y:/SmokingML V2/artifacts/models" + os.makedirs(selected_features_dir, exist_ok=True) + + # Remove 'smoking' from selected columns before saving (optional based on use-case) + 
class FeatureEngineer:
    """
    Derive extra features from a DataFrame according to configurable rules.

    Rules are read from ``config/feature_engineering_rules.json`` (relative
    to the current working directory) when that file exists; otherwise the
    built-in defaults from ``_get_default_rules`` are used.
    """

    def __init__(self):
        self.rules = self._load_rules()

    def _load_rules(self) -> Dict:
        """Load feature engineering rules from config file"""
        config_path = Path("config/feature_engineering_rules.json")
        if not config_path.exists():
            return self._get_default_rules()

        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _get_default_rules(self) -> Dict:
        """Default feature engineering rules if no config exists"""
        return {
            "health_indicators": [
                {
                    "name": "bmi_health_index",
                    "formula": "weight / (height ** 2)",
                    "enabled": True,
                    "description": "BMI-based health indicator"
                }
            ],
            "polynomial_features": ["age", "weight", "height"],
            "feature_ratios": [
                {
                    "name": "age_bmi_ratio",
                    "formula": "age / bmi_health_index",
                    "enabled": True
                }
            ],
            "polynomial_degree": 2
        }

    def _apply_formula_rules(self, df: pd.DataFrame, rules: List[Dict]) -> pd.DataFrame:
        """Evaluate each enabled formula rule and store it as a new column on a copy of df."""
        result = df.copy()

        for rule in rules:
            # A rule without an explicit flag counts as enabled.
            if not rule.get("enabled", True):
                continue

            name = rule.get("name", "<unnamed>")
            try:
                # SECURITY(review): eval() on a formula string. The rules file
                # can be rewritten through the PUT /feature-engineering/rules
                # endpoint, so formulas may come from request bodies, and
                # {"__builtins__": None} is not a real sandbox. Replace with a
                # restricted expression evaluator before exposing publicly.
                # Columns created by earlier rules are visible to later ones
                # because the namespace is rebuilt from `result` each time.
                result[rule["name"]] = eval(rule["formula"],
                                            {"__builtins__": None},
                                            {**dict(result), "np": np})
            except Exception as e:
                # Best effort by design: a failing rule is reported and skipped.
                print(f"Failed to calculate {name}: {str(e)}")

        return result

    def create_health_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate health indicator features based on configured rules"""
        return self._apply_formula_rules(df, self.rules.get("health_indicators", []))

    def create_polynomial_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate polynomial features for specified columns.

        Only configured columns that exist in ``df`` are expanded. The
        original (degree-1) columns are not duplicated: just the higher-degree
        and interaction terms are added, each prefixed with ``poly_``.
        """
        result = df.copy()
        features_to_transform = [col for col in self.rules.get("polynomial_features", [])
                                 if col in df.columns]

        if not features_to_transform:
            return result

        poly = PolynomialFeatures(
            degree=self.rules.get("polynomial_degree", 2),
            include_bias=False
        )

        poly_features = poly.fit_transform(df[features_to_transform])
        feature_names = poly.get_feature_names_out(features_to_transform)

        # Add only the interaction terms and higher degree terms: the first
        # len(features_to_transform) outputs are skipped as the inputs themselves.
        first_new = len(features_to_transform)
        for i, name in enumerate(feature_names[first_new:], start=first_new):
            result[f"poly_{name}"] = poly_features[:, i]

        return result

    def create_feature_ratios(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate feature ratios based on configured rules"""
        return self._apply_formula_rules(df, self.rules.get("feature_ratios", []))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply all feature engineering transformations.

        Order matters: health indicators first, then polynomial terms, then
        ratios (which may reference the health indicator columns). The input
        frame is not modified; each step works on its own copy.
        """
        result = self.create_health_indicators(df)
        result = self.create_polynomial_features(result)
        result = self.create_feature_ratios(result)
        return result
Prediction/model_deployment.py new file mode 100644 index 000000000..93ac94cff --- /dev/null +++ b/projects/prediction/Smoking Prediction/model_deployment.py @@ -0,0 +1,847 @@ + +#? STAGE 8: MODEL DEPLOYMENT + +from fastapi import FastAPI, HTTPException, Path, Body +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +import pandas as pd +import joblib +import os +import sys +import logging +from datetime import datetime +from dotenv import load_dotenv +from fastapi.openapi.utils import get_openapi +from sklearn.ensemble import VotingClassifier +from sklearn.preprocessing import PolynomialFeatures +from typing import Optional, List, Dict +from contextlib import asynccontextmanager +import socket +import uvicorn +from fastapi.openapi.docs import get_swagger_ui_html +from fastapi.responses import HTMLResponse +import json +from .feature_engineering import FeatureEngineer + +# Configure logging to both file and console with maximum verbosity +LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') +API_LOG_DIR = os.path.join(LOG_DIR, 'api') +DEPLOYMENT_LOG_DIR = os.path.join(LOG_DIR, 'deployment') +ERROR_LOG_DIR = os.path.join(LOG_DIR, 'errors') + +# Create log directories if they don't exist +os.makedirs(LOG_DIR, exist_ok=True) +os.makedirs(API_LOG_DIR, exist_ok=True) +os.makedirs(DEPLOYMENT_LOG_DIR, exist_ok=True) +os.makedirs(ERROR_LOG_DIR, exist_ok=True) + +# Configure logging with organized file structure +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(os.path.join(API_LOG_DIR, f'api_{datetime.now().strftime("%Y%m%d")}.log')), + logging.FileHandler(os.path.join(DEPLOYMENT_LOG_DIR, f'deployment_{datetime.now().strftime("%Y%m%d")}.log')), + logging.StreamHandler(sys.stdout) + ] +) + +# Configure error logging separately +error_handler = logging.FileHandler(os.path.join(ERROR_LOG_DIR, 
# Define lifespan to load models and handle startup logging
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Load the deployable models before the app starts and release them afterwards.

    Startup: every ``.pkl`` file in MODEL_PATH whose stem is a key of
    BEST_MODELS is loaded with joblib and its ``'model'`` entry is stored in
    the module-level ``models`` dict. A missing directory, no ``.pkl`` files,
    or no matching model aborts startup with an exception after logging it.
    Shutdown: the ``models`` dict is emptied.
    """
    try:
        logger.info(f"Starting model loading from {MODEL_PATH}")
        if not os.path.exists(MODEL_PATH):
            missing_dir_msg = f"Model directory not found at {MODEL_PATH}"
            logger.error(missing_dir_msg)
            raise Exception(missing_dir_msg)

        pickle_names = [entry for entry in os.listdir(MODEL_PATH) if entry.endswith('.pkl')]
        logger.info(f"Found model files: {pickle_names}")

        if len(pickle_names) == 0:
            no_files_msg = f"No .pkl model files found in {MODEL_PATH}"
            logger.error(no_files_msg)
            raise Exception(no_files_msg)

        for pickle_name in pickle_names:
            candidate = pickle_name.replace('.pkl', '')
            # Files that are not on the deployment shortlist are ignored.
            if candidate not in BEST_MODELS:
                continue

            pickle_path = os.path.join(MODEL_PATH, pickle_name)
            try:
                logger.info(f"Loading model {candidate} from {pickle_path}")
                # The pickle holds a dict of artifacts; only the estimator is kept.
                models[candidate] = joblib.load(pickle_path)['model']
                logger.info(f"Successfully loaded model: {candidate}")
            except Exception as load_error:
                logger.error(f"Error loading model {candidate}: {str(load_error)}", exc_info=True)
                raise

        if len(models) == 0:
            none_loaded_msg = f"No best models found for deployment in {MODEL_PATH}. Expected models: {list(BEST_MODELS.keys())}"
            logger.error(none_loaded_msg)
            raise Exception(none_loaded_msg)

        # Startup logging
        logger.info("=== Server Starting ===")
        logger.info(get_ip())
        for hint in (
            "You can access the API at:",
            " http://127.0.0.1:8000",
            " http://localhost:8000",
            "API documentation available at:",
            " http://127.0.0.1:8000/docs",
            " http://localhost:8000/docs",
            "Try both URLs if one doesn't work",
        ):
            logger.info(hint)

        logger.info("All models loaded successfully. Ready to serve.")
    except Exception as startup_error:
        logger.error(f"Error during startup: {str(startup_error)}", exc_info=True)
        raise startup_error

    yield

    # Cleanup
    logger.info("Cleaning up models")
    models.clear()
# Define input schema
class SmokingPredictionInput(BaseModel):
    """
    Request body for the prediction endpoint.

    Several fields use aliases containing spaces or parentheses (for example
    "height(cm)" and "serum creatinine"); these aliases are the keys shown in
    the example payload below. Fields declared with ``Field(...)`` or with no
    default are required; the others fall back to 0 / 0.0 when omitted.
    """
    # Body measurements (required)
    height_cm: float = Field(..., alias="height(cm)")
    weight_kg: float = Field(..., alias="weight(kg)")
    waist_cm: float = Field(..., alias="waist(cm)")
    age: float
    # Blood panel and blood pressure values
    ALT: float
    Gtp: float
    HDL: float
    LDL: float = Field(0.0)
    Cholesterol: float = Field(0.0)
    systolic: float
    relaxation: float
    hemoglobin: float
    serum_creatinine: float = Field(..., alias="serum creatinine")
    triglyceride: float
    # Optional inputs, defaulting to zero when the client leaves them out
    AST: Optional[float] = Field(0.0)
    dental_caries: Optional[int] = Field(0, alias="dental caries")
    eyesight_right: Optional[float] = Field(0.0, alias="eyesight(right)")
    eyesight_left: Optional[float] = Field(0.0, alias="eyesight(left)")
    fasting_blood_sugar: Optional[float] = Field(0.0, alias="fasting blood sugar")

    class Config:
        # NOTE(review): pydantic setting; presumably lets clients send either
        # the alias or the Python field name - confirm against the pydantic
        # version in use.
        populate_by_name = True
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "height(cm)": 170.0,
                "weight(kg)": 70.0,
                "waist(cm)": 85.0,
                "eyesight(left)": 1.0,
                "eyesight(right)": 1.0,
                "age": 35.0,
                "ALT": 25.0,
                "AST": 20.0,
                "Gtp": 30.0,
                "HDL": 50.0,
                "LDL": 100.0,
                "Cholesterol": 180.0,
                "dental caries": 0,
                "fasting blood sugar": 90.0,
                "relaxation": 80.0,
                "serum creatinine": 1.0,
                "triglyceride": 150.0,
                "hemoglobin": 15.0,
                "systolic": 120.0
            }
        }
# Prediction endpoint
@app.post("/predict/{model_name}", tags=["Predictions"])
async def predict(
    model_name: str = Path(
        ...,
        description="Available models: ml_olympiad_improved_final, archive_improved_final"
    ),
    input_data: SmokingPredictionInput = Body(...)
):
    """
    Predict smoking status for one record with the named model.

    Steps: validate the model name, turn the request into a one-row
    DataFrame, derive health indicators / ratios / polynomial terms, pick the
    model-specific feature columns in a fixed order, run the model and build
    the JSON response.

    Raises:
        HTTPException 404: ``model_name`` is not among the loaded models.
        HTTPException 400: feature calculation failed or a required feature
            column is missing.
        HTTPException 500: any other error while predicting.
    """
    logger.info(f"Prediction requested for model: {model_name}")
    try:
        # Clean up model name
        model_name = model_name.strip()

        if model_name not in models:
            error_msg = f"Model '{model_name}' not found. Available models: {list(models.keys())}"
            logger.error(error_msg)
            raise HTTPException(status_code=404, detail={"error": error_msg})

        # Get model parameters or use defaults
        model_params = model_parameters.get(model_name, DEFAULT_MODEL_PARAMETERS)
        confidence_threshold = model_params["confidence_threshold"]
        health_thresholds = model_params["health_indicator_thresholds"]

        # Convert input data to DataFrame; by_alias=True so the columns carry
        # the aliased names (e.g. "height(cm)") used throughout below
        input_dict = input_data.dict(by_alias=True)
        logger.debug(f"Raw input data: {input_dict}")
        data = pd.DataFrame([input_dict])

        try:
            # 1. Initialize all required numeric columns with safe defaults
            default_values = {
                'systolic': data.get('systolic', [0.0])[0],
                'triglyceride': data.get('triglyceride', [0.0])[0],
                'HDL': max(data.get('HDL', [1.0])[0], 1.0),  # Ensure HDL is at least 1
                'LDL': data.get('LDL', [0.0])[0],
                'AST': data.get('AST', [0.0])[0],
                'ALT': data.get('ALT', [0.0])[0],
                'Gtp': data.get('Gtp', [0.0])[0],
                'fasting blood sugar': data.get('fasting blood sugar', [0.0])[0]
            }

            # Update DataFrame with safe values; HDL falls back to 1.0 rather
            # than 0.0 because it is used as a divisor further down
            for col, value in default_values.items():
                if pd.isna(value):
                    data[col] = 0.0 if col != 'HDL' else 1.0
                else:
                    data[col] = value

            logger.debug("Initialized features with safe values")

            # 2. Calculate basic health indicators
            # height is given in cm, hence the /100 before squaring
            data['bmi'] = data['weight(kg)'] / ((data['height(cm)']/100) ** 2)
            data['liver_function'] = (data['AST'] + data['ALT'] + data['Gtp']) / 3
            data['cardiovascular_risk'] = (data['systolic'] * data['triglyceride']) / data['HDL']
            data['metabolic_index'] = (data['fasting blood sugar'] * data['bmi']) / data['HDL']

            # 3. Calculate health status indicators (1 when the value lies
            # inside the configured [low, high] band, else 0)
            data['bmi_status'] = ((data['bmi'] >= health_thresholds['bmi']['low']) &
                                  (data['bmi'] <= health_thresholds['bmi']['high'])).astype(int)

            data['liver_status'] = ((data['liver_function'] >= health_thresholds['liver_function']['low']) &
                                    (data['liver_function'] <= health_thresholds['liver_function']['high'])).astype(int)

            data['cv_risk_status'] = ((data['cardiovascular_risk'] >= health_thresholds['cardiovascular_risk']['low']) &
                                      (data['cardiovascular_risk'] <= health_thresholds['cardiovascular_risk']['high'])).astype(int)

            data['metabolic_status'] = ((data['metabolic_index'] >= health_thresholds['metabolic_index']['low']) &
                                        (data['metabolic_index'] <= health_thresholds['metabolic_index']['high'])).astype(int)

            # 4. Calculate additional ratios (+1 in the denominator avoids division by zero)
            data['hdl_ldl_ratio'] = data['HDL'] / (data['LDL'] + 1)
            data['ast_alt_ratio'] = data['AST'] / (data['ALT'] + 1)
            data['bp_ratio'] = data['systolic'] / (data['relaxation'] + 1)

            # 5. Generate polynomial features based on model type
            if model_name == 'ml_olympiad_improved_final':
                key_features = ['bmi', 'liver_function', 'cardiovascular_risk', 'metabolic_index']
                poly = PolynomialFeatures(degree=2, include_bias=False)
                poly_features = poly.fit_transform(data[key_features])
                # Every output column is stored; the feature list below picks
                # out indices 0, 1, 4 and 13
                for i in range(poly_features.shape[1]):
                    data[f'health_poly_{i}'] = poly_features[:, i]
            else:  # archive_improved_final
                # For archive model, we only need specific polynomial features
                key_features = ['bmi', 'liver_function', 'cardiovascular_risk']
                poly = PolynomialFeatures(degree=2, include_bias=False)
                poly_features = poly.fit_transform(data[key_features])
                # Only keep required polynomial features (0, 4, 5)
                data['health_poly_0'] = poly_features[:, 0]  # First feature
                data['health_poly_4'] = poly_features[:, 4]  # Fifth feature
                data['health_poly_5'] = poly_features[:, 5]  # Sixth feature

            logger.debug("All features calculated successfully")
            logger.debug(f"Available features: {list(data.columns)}")

        except Exception as e:
            error_msg = f"Error calculating health indicators: {str(e)}"
            logger.error(error_msg)
            logger.error(f"Data state: {data.to_dict()}")
            raise HTTPException(status_code=400, detail={"error": error_msg})

        # Select features based on model type
        # NOTE(review): presumably these lists mirror the columns each model
        # was trained on - confirm against the training pipeline
        if model_name == 'ml_olympiad_improved_final':
            required_features = [
                "age", "height(cm)", "weight(kg)", "systolic", "relaxation",
                "Cholesterol", "triglyceride", "HDL", "LDL", "hemoglobin",
                "serum creatinine", "AST", "ALT", "Gtp", "dental caries",
                "health_poly_0", "health_poly_1", "health_poly_4", "health_poly_13",
                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
            ]
        else:  # archive_improved_final
            required_features = [
                "age", "height(cm)", "weight(kg)", "waist(cm)", "systolic",
                "relaxation", "fasting blood sugar", "triglyceride", "HDL",
                "LDL", "hemoglobin", "serum creatinine", "ALT", "Gtp",
                "dental caries", "health_poly_0", "health_poly_4", "health_poly_5",
                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
            ]

        # Create a new DataFrame with only required features in correct order
        prediction_data = pd.DataFrame()
        for feature in required_features:
            if feature not in data.columns:
                error_msg = f"Missing required feature: {feature}"
                logger.error(error_msg)
                raise HTTPException(status_code=400, detail={"error": error_msg})
            prediction_data[feature] = data[feature]

        logger.debug(f"Final features for prediction: {list(prediction_data.columns)}")

        # Make prediction
        model = models[model_name]
        prediction = model.predict(prediction_data)[0]
        probabilities = model.predict_proba(prediction_data)[0]
        # Confidence is the highest class probability, whichever class it belongs to
        confidence = float(max(probabilities))

        # Apply confidence threshold: report "smoker" only when the model
        # predicted class 1 and the confidence reaches the threshold
        adjusted_prediction = 1 if confidence >= confidence_threshold and prediction == 1 else 0

        result = {
            "model_used": BEST_MODELS[model_name],
            "prediction": int(adjusted_prediction),
            "label": "Smoker" if adjusted_prediction == 1 else "Non-smoker",
            "confidence": f"{confidence:.2%}",
            "confidence_threshold": confidence_threshold,
            # Same in-band checks as step 3, recomputed as plain booleans
            "health_indicators": {
                "bmi_status": bool(data['bmi'].iloc[0] >= health_thresholds['bmi']['low'] and
                                   data['bmi'].iloc[0] <= health_thresholds['bmi']['high']),
                "liver_status": bool(data['liver_function'].iloc[0] >= health_thresholds['liver_function']['low'] and
                                     data['liver_function'].iloc[0] <= health_thresholds['liver_function']['high']),
                "cardiovascular_status": bool(data['cardiovascular_risk'].iloc[0] >= health_thresholds['cardiovascular_risk']['low'] and
                                              data['cardiovascular_risk'].iloc[0] <= health_thresholds['cardiovascular_risk']['high']),
                "metabolic_status": bool(data['metabolic_index'].iloc[0] >= health_thresholds['metabolic_index']['low'] and
                                         data['metabolic_index'].iloc[0] <= health_thresholds['metabolic_index']['high'])
            },
            "calculated_features": {
                "bmi": float(data['bmi'].iloc[0]),
                "liver_function": float(data['liver_function'].iloc[0]),
                "cardiovascular_risk": float(data['cardiovascular_risk'].iloc[0]),
                "metabolic_index": float(data['metabolic_index'].iloc[0])
            },
            "model_type": "XGBoost" if model_name == "ml_olympiad_improved_final" else "Ensemble",
            "features_used": required_features
        }

        return result

    except HTTPException:
        # Already carries the intended status code; pass it through untouched
        raise
    except Exception as e:
        error_msg = f"Error making prediction: {str(e)}"
        logger.error(error_msg)
        logger.error("Full traceback: ", exc_info=True)
        raise HTTPException(status_code=500, detail={"error": error_msg})
@app.put("/feature-engineering/rules", tags=["Feature Engineering"])
async def update_feature_engineering_rules(rules: FeatureEngineeringRules):
    """
    Update feature engineering rules including:
    - Health indicator calculations
    - Polynomial feature generation rules
    - Feature ratio calculations
    """
    try:
        # Serialise the validated request body and persist it under ./config,
        # creating the directory on first use.
        payload = rules.dict()
        os.makedirs("config", exist_ok=True)
        with open("config/feature_engineering_rules.json", "w") as rules_file:
            json.dump(payload, rules_file, indent=4)
    except Exception as e:
        # Any failure while serialising or writing is reported as a 500.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to update feature engineering rules: {str(e)}"
        )
    else:
        # Echo the stored rules back so the client can confirm what was saved.
        return {
            "status": "success",
            "message": "Feature engineering rules updated successfully",
            "rules": payload
        }
class ModelDeployment:
    """Bundle a feature engineer and a loaded model behind one async ``predict``.

    The model is looked up once, at construction time, under the
    ``'smoking_status'`` key of the module-level ``models`` mapping. When that
    key is absent ``self.model`` is ``None`` and ``predict`` reports it
    explicitly instead of failing with an attribute error.
    """

    def __init__(self):
        self.feature_engineer = FeatureEngineer()
        # ``models`` is defined elsewhere in this module; a missing key yields None.
        self.model = models.get('smoking_status', None)

    async def predict(self, data: Dict):
        """Make predictions using the deployed model.

        Args:
            data: A single record, mapping feature name to value. It is wrapped
                in a one-row DataFrame before feature engineering.

        Returns:
            A dict with ``prediction`` (int, first element of ``predict``),
            ``probability`` (float, column 1 of the first ``predict_proba``
            row) and ``status`` set to ``"success"``.

        Raises:
            HTTPException: status 500 when no model was found at construction
                time, or when feature engineering or inference fails.
        """
        # Checked outside the try block so this HTTPException is not re-wrapped
        # by the generic handler below.
        if self.model is None:
            error_msg = "Prediction failed: model 'smoking_status' is not loaded"
            logger.error(error_msg)
            raise HTTPException(status_code=500, detail=error_msg)

        try:
            # Convert input data to a one-row DataFrame
            df = pd.DataFrame([data])

            # Apply feature engineering
            df = self.feature_engineer.transform(df)

            # Make prediction
            prediction = self.model.predict(df)[0]
            probability = self.model.predict_proba(df)[0][1]
        except Exception as e:
            # exc_info keeps the traceback in the log; the client only sees the message.
            logger.error(f"Prediction failed: {str(e)}", exc_info=True)
            raise HTTPException(
                status_code=500,
                detail=f"Prediction failed: {str(e)}"
            ) from e

        return {
            "prediction": int(prediction),
            "probability": float(probability),
            "status": "success"
        }
if __name__ == "__main__":
    # reload=True is intentionally not passed: uvicorn only supports auto-reload
    # when the application is given as an import string ("module:app"). With an
    # app object it logs a warning and exits instead of serving. For
    # auto-reload during development, start the server from the command line
    # with `uvicorn <module>:app --reload`.
    uvicorn.run(app, host="0.0.0.0", port=8000)
def convert_to_python_types(d):
    """Recursively convert NumPy/pandas values to native Python types for JSON serialization.

    Handles, at any nesting depth:
      * dicts (both keys and values are converted),
      * lists and tuples (tuples come back as plain tuples),
      * ``np.ndarray`` and ``pd.Series`` (converted through ``tolist()``),
      * NumPy scalar booleans, integers and floats.

    Any other value is returned unchanged.

    Args:
        d: The value or container to convert.

    Returns:
        The same structure built from ``bool``/``int``/``float``/``list``/
        ``tuple``/``dict`` wherever a NumPy or pandas object was found.
    """
    if isinstance(d, dict):
        # Keys are converted too: json.dump rejects NumPy scalar keys.
        return {
            convert_to_python_types(k): convert_to_python_types(v)
            for k, v in d.items()
        }
    if isinstance(d, np.bool_):
        # np.bool_ is neither a Python bool nor an np.integer, and json cannot
        # serialize it, so it needs its own branch.
        return bool(d)
    if isinstance(d, np.integer):
        return int(d)
    if isinstance(d, np.floating):
        return float(d)
    if isinstance(d, (np.ndarray, pd.Series)):
        # tolist() may still contain NumPy objects (e.g. object arrays), so recurse.
        return convert_to_python_types(d.tolist())
    if isinstance(d, list):
        return [convert_to_python_types(i) for i in d]
    if isinstance(d, tuple):
        return tuple(convert_to_python_types(i) for i in d)
    return d
plt.savefig(f'artifacts/visualizations/roc_curve_{model_name}.png') + plt.close() + + # Plot Confusion Matrix + cm = confusion_matrix(y_test, y_pred) + plt.figure(figsize=(8, 6)) + sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') + plt.xlabel('Predicted') + plt.ylabel('Actual') + plt.title(f'Confusion Matrix - {model_name}') + plt.savefig(f'artifacts/visualizations/confusion_matrix_{model_name}.png') + plt.close() + + # Enhanced error analysis + errors_df = pd.DataFrame({ + 'Actual': y_test, + 'Predicted': y_pred, + 'Probability': y_pred_proba + }) + errors_df['Error_Type'] = 'Correct' + errors_df.loc[(errors_df['Actual'] == 1) & (errors_df['Predicted'] == 0), 'Error_Type'] = 'False Negative' + errors_df.loc[(errors_df['Actual'] == 0) & (errors_df['Predicted'] == 1), 'Error_Type'] = 'False Positive' + + # Add feature values for error analysis + errors_df = pd.concat([errors_df, X_test.reset_index(drop=True)], axis=1) + + # Save error analysis with converted types + error_analysis = { + 'false_positives': { + 'count': int(len(errors_df[errors_df['Error_Type'] == 'False Positive'])), + 'avg_probability': float(errors_df[errors_df['Error_Type'] == 'False Positive']['Probability'].mean()), + 'feature_means': convert_to_python_types( + errors_df[errors_df['Error_Type'] == 'False Positive'][features].mean().to_dict() + ) + }, + 'false_negatives': { + 'count': int(len(errors_df[errors_df['Error_Type'] == 'False Negative'])), + 'avg_probability': float(errors_df[errors_df['Error_Type'] == 'False Negative']['Probability'].mean()), + 'feature_means': convert_to_python_types( + errors_df[errors_df['Error_Type'] == 'False Negative'][features].mean().to_dict() + ) + } + } + + # Save detailed error analysis + with open(f'artifacts/visualizations/error_analysis_{model_name}.json', 'w') as f: + json.dump(error_analysis, f, indent=4) + + # Plot confusion matrix with percentages + plt.figure(figsize=(10, 8)) + cm_percent = confusion_matrix(y_test, y_pred, normalize='true') * 100 + 
def main():
    """Evaluate every saved model on its processed test set and store the results.

    Reads ``artifacts/models/model_info.json`` for each dataset's model path,
    model name and feature list, runs ``evaluate_model`` on the matching test
    CSV, prints a summary, and writes all metrics to
    ``artifacts/models/evaluation_results.json``.
    """
    with open('artifacts/models/model_info.json', 'r') as info_file:
        model_info = json.load(info_file)

    # Processed test split for each dataset
    test_csv_by_dataset = {
        'ml_olympiad': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv',
        'archive': 'Y:/SmokingML V2/data/processed/archive_test.csv'
    }

    evaluation_results = {}

    for dataset_name, test_path in test_csv_by_dataset.items():
        print(f"\nEvaluating model for {dataset_name} dataset...")

        info = model_info[dataset_name]

        # Model first, then data, then the feature list recorded for it
        model = joblib.load(info['model_path'])
        test_df = pd.read_csv(test_path)
        features = info['features']
        feature_count = len(features)

        X_test, y_test = test_df[features], test_df['smoking']

        print(f"Number of features being used for {dataset_name}: {feature_count}")

        # Artifacts written by evaluate_model are labelled "<dataset>_<model name>"
        label = f"{dataset_name}_{info['name']}"
        metrics = evaluate_model(model, X_test, y_test, label, features)

        evaluation_results[dataset_name] = {
            'model_name': info['name'],
            'metrics': metrics,
            'num_features': feature_count,
            'features': features
        }

        print(f"\nResults for {dataset_name}:")
        print(f"Model: {info['name']}")
        print(f"Number of features: {feature_count}")
        for metric_name in metrics:
            print(f"{metric_name}: {metrics[metric_name]:.4f}")

    with open('artifacts/models/evaluation_results.json', 'w') as results_file:
        json.dump(evaluation_results, results_file, indent=4)

    print("\nEvaluation completed! Results saved to artifacts/models/evaluation_results.json")
    print("Visualizations saved to artifacts/visualizations/")
def custom_scorer(y_true, y_pred):
    """Score predictions with precision counted twice as heavily as recall and F1.

    Returns ``(2 * precision + recall + f1) / 4`` for the given true and
    predicted labels.
    """
    weighted_terms = (
        2 * precision_score(y_true, y_pred),  # precision carries double weight
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    return sum(weighted_terms) / 4
def create_efficient_ensemble(dataset_name):
    """Build the (unfitted) classifier used for ``dataset_name``.

    ``'archive'`` gets a soft-voting ensemble of a Random Forest and an
    XGBoost model with equal weights; every other dataset name gets a single
    XGBoost classifier.
    """
    if dataset_name != 'archive':
        # Standalone XGBoost for ml_olympiad (and any name other than 'archive')
        return XGBClassifier(
            max_depth=5,
            learning_rate=0.1,
            n_estimators=100,
            min_child_weight=3,
            gamma=0.1,
            subsample=0.9,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            tree_method='hist',
            eval_metric='logloss',
            enable_categorical=False
        )

    forest_settings = {
        'n_estimators': 1200,
        'max_depth': 10,
        'min_samples_split': 8,
        'min_samples_leaf': 6,
        'max_features': 0.7,
        'min_impurity_decrease': 0.004,
        'class_weight': {0: 1.2, 1: 1},  # mild up-weighting of class 0
        'criterion': 'entropy',
        'random_state': 42,
        'n_jobs': -1,
        'bootstrap': True,
        'oob_score': True,
        'max_samples': 0.85,
    }
    booster_settings = {
        'max_depth': 7,
        'learning_rate': 0.03,
        'n_estimators': 400,
        'min_child_weight': 3,
        'gamma': 0.15,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'scale_pos_weight': 1.2,
        'random_state': 42,
        'n_jobs': -1,
    }

    members = [
        ('rf', RandomForestClassifier(**forest_settings)),
        ('xgb', XGBClassifier(**booster_settings)),
    ]
    # Soft voting averages predicted probabilities; both members count equally.
    return VotingClassifier(estimators=members, voting='soft', weights=[0.5, 0.5])
def create_visualizations(y_true, y_pred, y_pred_proba, dataset_name, model_name):
    """Render and save the evaluation plots for one dataset/model pair.

    Four PNG files are written to ``artifacts/visualizations`` (created if
    missing): the confusion matrix as counts, the confusion matrix as
    row-wise percentages, the ROC curve and the precision-recall curve. Each
    file name ends in ``_<dataset_name>_<model_name>.png``.
    """
    os.makedirs('artifacts/visualizations', exist_ok=True)

    figure_size = (10, 8)
    heading = f'{dataset_name} ({model_name})'
    file_tag = f'{dataset_name}_{model_name}'

    def _save_and_close(stem):
        # Write the current figure to disk and release it.
        plt.savefig(f'artifacts/visualizations/{stem}_{file_tag}.png')
        plt.close()

    # --- Confusion matrix (counts) ---
    plt.figure(figsize=figure_size)
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {heading}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    _save_and_close('confusion_matrix')

    # --- Confusion matrix (each row as a percentage of that true class) ---
    plt.figure(figsize=figure_size)
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    percentages = counts.astype('float') / row_totals * 100
    sns.heatmap(percentages, annot=True, fmt='.1f', cmap='Blues')
    plt.title(f'Confusion Matrix (%) - {heading}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    _save_and_close('confusion_matrix_percent')

    # --- ROC curve ---
    plt.figure(figsize=figure_size)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    roc_area = auc(false_pos_rate, true_pos_rate)
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
             label=f'ROC curve (AUC = {roc_area:.2f})')
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {heading}')
    plt.legend(loc="lower right")
    _save_and_close('roc_curve')

    # --- Precision-recall curve ---
    plt.figure(figsize=figure_size)
    precision_values, recall_values, _ = precision_recall_curve(y_true, y_pred_proba)
    pr_area = auc(recall_values, precision_values)
    plt.plot(recall_values, precision_values, color='darkorange', lw=2,
             label=f'PR curve (AUC = {pr_area:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {heading}')
    plt.legend(loc="lower right")
    _save_and_close('pr_curve')
Training features shape: {X_train.shape}") + + print("Selecting best features...") + # Feature selection + selector = select_best_features(X_train, y_train, threshold='median') + X_train_selected = selector.transform(X_train) + X_test_selected = selector.transform(X_test) + print(f"Selected {X_train_selected.shape[1]} features") + + # Apply modified SMOTE for archive dataset + if dataset_name == 'archive': + print("Applying SMOTE resampling...") + smote = SMOTE( + random_state=42, + k_neighbors=5, + sampling_strategy=0.85 + ) + X_train_selected, y_train = smote.fit_resample(X_train_selected, y_train) + print(f"After SMOTE - Training data shape: {X_train_selected.shape}") + + print("Training model...") + # Create and train model + model = create_efficient_ensemble(dataset_name) + model.fit(X_train_selected, y_train) + + print("Making predictions...") + # Get predictions + y_pred = model.predict(X_test_selected) + y_pred_proba = model.predict_proba(X_test_selected)[:, 1] + + print("Calculating metrics...") + # Calculate metrics + metrics = { + 'accuracy': float(accuracy_score(y_test, y_pred)), + 'precision': float(precision_score(y_test, y_pred)), + 'recall': float(recall_score(y_test, y_pred)), + 'f1': float(f1_score(y_test, y_pred)), + 'roc_auc': float(roc_auc_score(y_test, y_pred_proba)) + } + + print("Creating visualizations...") + # Create visualizations + model_name = 'Ensemble' if dataset_name == 'archive' else 'XGBoost' + create_visualizations(y_test, y_pred, y_pred_proba, dataset_name, model_name) + + # Save results + results[dataset_name] = { + 'metrics': metrics, + 'n_features_selected': int(X_train_selected.shape[1]), + 'features': list(X_train.columns[selector.get_support()]) + } + + # Save model and feature selector + model_artifacts = { + 'model': model, + 'selector': selector, + 'feature_names': list(X_train.columns) + } + model_path = f"models/{dataset_name}_improved_final.pkl" + joblib.dump(model_artifacts, model_path) + print(f"Saved improved model 
to {model_path}") + + print(f"\nFinal Results for {dataset_name}:") + for metric, value in metrics.items(): + print(f"{metric}: {value:.4f}") + + # Save results + os.makedirs('artifacts/improvements', exist_ok=True) + with open('artifacts/improvements/final_results.json', 'w') as f: + json.dump(results, f, indent=4) + + print("\nFinal improvements completed! Results saved to artifacts/improvements/final_results.json") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/projects/prediction/Smoking Prediction/model_optimization.py b/projects/prediction/Smoking Prediction/model_optimization.py new file mode 100644 index 000000000..aecf654d9 --- /dev/null +++ b/projects/prediction/Smoking Prediction/model_optimization.py @@ -0,0 +1,211 @@ + +# ? STAGE 5: MODEL OPTIMIZATION + + +import os +import json +import joblib +import numpy as np +import pandas as pd +from sklearn.model_selection import RandomizedSearchCV, cross_val_score +import scipy.stats as stats +from sklearn.ensemble import RandomForestClassifier +from xgboost import XGBClassifier +from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score + +def create_balanced_scorer(dataset_name): + """Create a custom scorer that ensures all metrics improve for the specific dataset""" + def balanced_scorer(y_true, y_pred): + # Calculate all metrics + acc = accuracy_score(y_true, y_pred) + prec = precision_score(y_true, y_pred) + rec = recall_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + + # Select thresholds based on dataset + if dataset_name == 'ml_olympiad': + thresholds = { + 'accuracy': 0.7777, + 'precision': 0.7203, + 'recall': 0.7981, + 'f1': 0.7572 + } + else: # archive dataset + thresholds = { + 'accuracy': 0.7724, + 'precision': 0.6957, + 'recall': 0.6768, + 'f1': 0.6861 + } + + # Calculate improvements relative to thresholds + acc_imp = (acc - thresholds['accuracy']) + prec_imp = (prec - thresholds['precision']) + rec_imp = (rec - 
def optimize_model(X, y, model_type, param_grid, dataset_name, *,
                   n_iter=30, cv=3, random_state=42):
    """Tune hyperparameters with a randomized search scored by the balanced scorer.

    The search is a ``RandomizedSearchCV`` (not an exhaustive grid search)
    that samples ``n_iter`` candidates from ``param_grid`` and scores each
    with the scorer returned by ``create_balanced_scorer(dataset_name)``.

    Args:
        X: Training features.
        y: Training labels.
        model_type: Unfitted estimator instance to tune.
        param_grid: Mapping of parameter name to a list of values or a
            distribution, passed as ``param_distributions``.
        dataset_name: Selects the dataset-specific thresholds inside the
            balanced scorer.
        n_iter: Number of parameter settings sampled (default 30).
        cv: Number of cross-validation folds (default 3).
        random_state: Seed for the parameter sampling (default 42).

    Returns:
        A tuple ``(best_estimator, info)`` where ``info`` has:
          * ``'best_params'``: the winning parameter setting,
          * ``'best_score'``: the search's best balanced score as a float,
          * ``'cv_results'``: accuracy/precision/recall/F1 of the best
            estimator predicting ``X`` itself. Despite the key name these are
            training-set metrics, not cross-validated ones; the key is kept
            for callers that already read it.

    Raises:
        Any exception raised while fitting a candidate, because the search
        runs with ``error_score='raise'``.
    """
    # Create dataset-specific balanced scorer
    balanced_scorer = create_balanced_scorer(dataset_name)

    search = RandomizedSearchCV(
        estimator=model_type,
        param_distributions=param_grid,
        n_iter=n_iter,
        scoring=balanced_scorer,
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
        error_score='raise'
    )

    search.fit(X, y)

    best_estimator = search.best_estimator_

    # Re-predict the training data with the best model (resubstitution metrics)
    y_pred = best_estimator.predict(X)

    metrics = {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred)
    }

    return best_estimator, {
        'best_params': search.best_params_,
        'best_score': float(search.best_score_),
        'cv_results': metrics
    }
'Y:/SmokingML V2/data/processed/archive_test.csv' + } + } + + # Load model information + with open('artifacts/models/model_info.json', 'r') as f: + model_info = json.load(f) + + # Updated parameter grids with more focused ranges + param_grids = { + 'XGBoost': { + 'max_depth': [5, 6, 7], + 'learning_rate': stats.uniform(0.01, 0.1), + 'n_estimators': [300, 400, 500], + 'min_child_weight': [2, 3, 4], + 'gamma': stats.uniform(0, 0.3), + 'subsample': stats.uniform(0.7, 0.3), + 'colsample_bytree': stats.uniform(0.7, 0.3), + 'scale_pos_weight': [1.0, 1.1], + 'reg_alpha': stats.uniform(0.0, 0.3), + 'reg_lambda': stats.uniform(1.0, 2.0), + }, + 'Random_Forest': { + 'n_estimators': [200, 300], # Reduced from 3 values to 2 + 'max_depth': [15, 20], # Still allows deep trees but not overly large + 'min_samples_split': [4], # Fixed to one optimal value + 'min_samples_leaf': [2], # Fixed to one optimal value + 'max_features': ['sqrt'], # Typically best for classification + 'class_weight': ['balanced'] # Good for imbalance handling + } + } + + optimization_results = {} + + for dataset_name, paths in dataset_paths.items(): + print(f"\nOptimizing model for {dataset_name} dataset...") + + # Load training data + train_df = pd.read_csv(paths['train']) + test_df = pd.read_csv(paths['test']) + + # Get features from model info + features = model_info[dataset_name]['features'] + + # Prepare data + X_train = train_df[features] + y_train = train_df['smoking'] + X_test = test_df[features] + y_test = test_df['smoking'] + + # Select model type and parameter grid + model_name = model_info[dataset_name]['name'] + # Using XGBoost for both datasets + model_type = XGBClassifier( + random_state=42, + eval_metric='logloss', + enable_categorical=False + ) + param_grid = param_grids['XGBoost'] # Use same parameter grid for both datasets + + + # Optimize model + print(f"Performing grid search with cross-validation for {model_name}...") + best_model, cv_results = optimize_model(X_train, y_train, 
model_type, param_grid, dataset_name) + + # Evaluate on test set + y_pred = best_model.predict(X_test) + test_metrics = { + 'accuracy': accuracy_score(y_test, y_pred), + 'precision': precision_score(y_test, y_pred), + 'recall': recall_score(y_test, y_pred), + 'f1': f1_score(y_test, y_pred) + } + + # Save optimized model + model_path = f"models/{dataset_name}_{model_name}_optimized.pkl" + joblib.dump(best_model, model_path) + + # Store results + optimization_results[dataset_name] = { + 'model_name': model_name, + 'best_params': cv_results['best_params'], + 'cv_scores': cv_results['cv_results'], + 'test_scores': test_metrics, + 'model_path': model_path + } + + print(f"\nOptimization results for {dataset_name}:") + print(f"Best parameters: {cv_results['best_params']}") + print("\nTest set scores:") + for metric, score in test_metrics.items(): + print(f"{metric}: {score:.4f}") + + # Save optimization results + os.makedirs('artifacts/optimization', exist_ok=True) + with open('artifacts/optimization/optimization_results.json', 'w') as f: + json.dump(optimization_results, f, indent=4) + + print("\nOptimization completed! Results saved to artifacts/optimization/optimization_results.json") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/projects/prediction/Smoking Prediction/model_training.py b/projects/prediction/Smoking Prediction/model_training.py new file mode 100644 index 000000000..b864754af --- /dev/null +++ b/projects/prediction/Smoking Prediction/model_training.py @@ -0,0 +1,112 @@ + +#? 
STAGE 4: MODEL TRAINING + +import os +import joblib +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from xgboost import XGBClassifier +from sklearn.metrics import accuracy_score, confusion_matrix, classification_report + +# Paths to datasets +dataset_paths = { + 'ml_olympiad': { + 'train': 'Y:/SmokingML V2/data/processed/ml_olympiad_train.csv', + 'test': 'Y:/SmokingML V2/data/processed/ml_olympiad_test.csv' + }, + 'archive': { + 'train': 'Y:/SmokingML V2/data/processed/archive_train.csv', + 'test': 'Y:/SmokingML V2/data/processed/archive_test.csv' + } +} + +# Directory to save models +os.makedirs("models", exist_ok=True) + +# Best performing models for each dataset +models = { + 'ml_olympiad': { + 'name': 'XGBoost', + 'model': XGBClassifier( + random_state=42, + eval_metric='logloss', + enable_categorical=False, # Modern replacement for use_label_encoder + verbosity=1 + ) + }, + 'archive': { + 'name': 'XGBoost', + 'model': XGBClassifier( + random_state=42, + eval_metric='logloss', + enable_categorical=False, # Modern replacement for use_label_encoder + verbosity=1 + ) + } +} + +# Dictionary to store trained models and their metrics +model_info = {} + +for dataset_name, paths in dataset_paths.items(): + print(f"\nProcessing {dataset_name} dataset...") + + # Load datasets + train_df = pd.read_csv(paths['train']) + test_df = pd.read_csv(paths['test']) + + # Get all features except 'smoking' (target) + features = [col for col in train_df.columns if col != 'smoking'] + + # Split into features and target + x_train = train_df[features] + y_train = train_df['smoking'] + x_val = test_df[features] + y_val = test_df['smoking'] + + print(f"\n======= Dataset Info: {dataset_name.replace('_', ' ').title()} =======") + print(f"Training data shape: {x_train.shape}") + print(f"Number of features: {len(features)}") + print("Features:", features) + + # Store model info + model_info[dataset_name] = { + 'name': models[dataset_name]['name'], 
+ 'features': features # Store all features for this dataset + } + + # Get and train the appropriate model + model = models[dataset_name]['model'] + print(f"\nTraining {models[dataset_name]['name']} on {dataset_name} dataset...") + model.fit(x_train, y_train) + + # Save model + model_filename = f"models/{dataset_name}_{models[dataset_name]['name']}.pkl" + joblib.dump(model, model_filename) + print(f"Saved {models[dataset_name]['name']} model at {model_filename}") + + # Evaluate model + y_pred = model.predict(x_val) + accuracy = accuracy_score(y_val, y_pred) + print(f"\n{models[dataset_name]['name']} Accuracy on {dataset_name}: {accuracy:.4f}") + + # Store metrics in model_info + model_info[dataset_name].update({ + 'accuracy': accuracy, + 'model_path': model_filename + }) + + # Extra Evaluation Metrics + print(f"\nConfusion Matrix ({models[dataset_name]['name']} - {dataset_name}):") + print(confusion_matrix(y_val, y_pred)) + + print(f"\nClassification Report ({models[dataset_name]['name']} - {dataset_name}):") + print(classification_report(y_val, y_pred)) + +# Save model information for use in evaluation and API +model_info_path = "artifacts/models/model_info.json" +os.makedirs(os.path.dirname(model_info_path), exist_ok=True) +with open(model_info_path, 'w') as f: + import json + json.dump(model_info, f, indent=4) diff --git a/projects/prediction/SmokingPredictionModel b/projects/prediction/SmokingPredictionModel deleted file mode 160000 index 5f8875eb0..000000000 --- a/projects/prediction/SmokingPredictionModel +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5f8875eb0244aca5d1d0d3b43ba54d20e61ad396