class DataIngestion:
    """Load raw CSV datasets and split each one into train and test frames.

    Only the "train" path of every dataset entry is read; the test frame is
    carved out of that same file with ``train_test_split``.
    """

    def __init__(self, dataset_paths, test_size=0.2, random_state=42):
        """
        dataset_paths: dictionary mapping a dataset name to a dictionary that
            holds the CSV location under the key "train".
            Example:
            {
                "dataset1": {"train": "path/to/dataset1_train.csv"},
                "dataset2": {"train": "path/to/dataset2_train.csv"},
            }
        test_size: share of rows placed in the test frame (default 0.2).
        random_state: seed handed to ``train_test_split`` so the split is
            reproducible (default 42).
        """
        self.dataset_paths = dataset_paths
        self.test_size = test_size
        self.random_state = random_state

    def load_data(self):
        """Read every configured CSV and return its train/test split.

        Returns:
            dict: ``{dataset_name: {"train": DataFrame, "test": DataFrame}}``.
            An empty ``dataset_paths`` yields an empty dict.

        Raises:
            KeyError: if a dataset entry has no "train" path.
        """
        datasets = {}
        for dataset_name, paths in self.dataset_paths.items():
            try:
                train_path = paths["train"]
            except KeyError:
                # Name the offending dataset; a bare KeyError('train') gives
                # no hint about which entry is misconfigured.
                raise KeyError(
                    f"Dataset '{dataset_name}' has no 'train' path configured"
                ) from None

            full_df = pd.read_csv(train_path)

            # Split the single source file into train and test partitions.
            train_data, test_data = train_test_split(
                full_df,
                test_size=self.test_size,
                random_state=self.random_state,
            )

            datasets[dataset_name] = {
                "train": train_data,
                "test": test_data,
            }

        return datasets
def preprocess_data(train_df, test_df, target_column='smoking', test_size=0.2, random_state=42):
    """Impute, one-hot encode and scale a labelled frame, then split it.

    Args:
        train_df: labelled DataFrame; must contain ``target_column``.
        test_df: hold-out DataFrame with the same feature columns. It may be
            unlabelled. It is pushed through the fitted transformers, but its
            transformed copy is not part of the return value.
        target_column: name of the label column (default 'smoking').
        test_size: share of ``train_df`` rows used for validation.
        random_state: seed for the train/validation split.

    Returns:
        tuple: ``(x_train, x_val, y_train, y_val, selected_features)`` where
        ``selected_features`` is the list of column names of ``x_train``.

    Raises:
        KeyError: if ``train_df`` has no ``target_column``.
    """
    train_target = train_df[target_column]
    train_features = train_df.drop(columns=[target_column])
    # The hold-out frame is allowed to come without labels, matching the
    # other helpers in this module that check for the target before using it.
    test_features = test_df.drop(columns=[target_column], errors='ignore')

    # Numeric columns, target excluded
    num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = train_features.select_dtypes(include=['object']).columns

    # NOTE(review): the imputer and scaler below are fitted on all rows of
    # train_df, i.e. before the validation rows are split off, so validation
    # statistics leak into the fit. Left unchanged to keep outputs identical;
    # consider fitting after the split.

    # Mean-impute missing numeric values (skipped when there is nothing
    # numeric, because the transformers reject zero-column input).
    if num_cols:
        imputer = SimpleImputer(strategy='mean')
        train_features[num_cols] = imputer.fit_transform(train_features[num_cols])
        test_features[num_cols] = imputer.transform(test_features[num_cols])

    # One-hot encode categorical columns; unseen test categories encode to zeros.
    if len(cat_cols) > 0:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        train_encoded = pd.DataFrame(
            encoder.fit_transform(train_features[cat_cols]),
            index=train_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )
        test_encoded = pd.DataFrame(
            encoder.transform(test_features[cat_cols]),
            index=test_features.index,
            columns=encoder.get_feature_names_out(cat_cols)
        )

        # Swap the raw categorical columns for their encoded counterparts.
        train_features = pd.concat(
            [train_features.drop(cat_cols, axis=1), train_encoded], axis=1
        )
        test_features = pd.concat(
            [test_features.drop(cat_cols, axis=1), test_encoded], axis=1
        )

    # Standardise only the original numeric columns, not the one-hot columns.
    if num_cols:
        scaler = StandardScaler()
        train_features[num_cols] = scaler.fit_transform(train_features[num_cols])
        test_features[num_cols] = scaler.transform(test_features[num_cols])

    # Hold back part of the labelled data for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        train_features, train_target,
        test_size=test_size,
        random_state=random_state
    )

    selected_features = x_train.columns.tolist()

    return x_train, x_val, y_train, y_val, selected_features
def remove_highly_correlated_features(train_df, test_df, threshold=0.9, target_column='smoking'):
    """Drop features that are near-duplicates of an earlier feature.

    For every pair of numeric feature columns in ``train_df`` the absolute
    Pearson correlation is computed; the later column of any pair whose
    absolute correlation exceeds ``threshold`` is removed from both frames.

    Args:
        train_df: frame the correlations are measured on.
        test_df: frame that receives the same column removals.
        threshold: absolute correlation above which a column is dropped.
        target_column: label column; it is never analysed or dropped.

    Returns:
        tuple: ``(train_df, test_df)`` without the redundant columns.
    """
    # Keep the label out of the analysis, otherwise a strongly predictive
    # feature placed before it would cause the label itself to be dropped.
    feature_frame = train_df.drop(columns=[target_column], errors='ignore')

    # Restrict to numeric columns explicitly (corr() rejects text columns on
    # recent pandas) and use |r| so strong negative correlation counts too.
    correlation_matrix = feature_frame.select_dtypes(include=[np.number]).corr().abs()

    # Strict upper triangle: each pair is inspected once and it is always the
    # later column of the pair that gets flagged.
    upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper_triangle = correlation_matrix.where(upper_mask)

    flagged = (upper_triangle > threshold).any(axis=0)
    drop_cols = [column for column, is_redundant in flagged.items() if is_redundant]

    # errors='ignore': the test frame may legitimately lack some columns.
    return (
        train_df.drop(columns=drop_cols),
        test_df.drop(columns=drop_cols, errors='ignore'),
    )
Shape:", train_ml.shape) + print("ML Olympiad Test Data Shape:", test_ml.shape) + print(train_ml.head()) + print("Archive Train Data Shape:", train_archive.shape) + print("Archive Test Data Shape:", test_archive.shape) + print(test_archive.head()) + + #* Apply Preprocessing to all datasets + x_train_ml, x_val_ml, y_train_ml, y_val_ml, selected_features_ml = preprocess_data(train_ml, test_ml) + x_train_archive, x_val_archive, y_train_archive, y_val_archive, selected_features_archive = preprocess_data(train_archive, test_archive) + + preprocessed_data_paths = { + "ml-olympiad-smoking": { + "train": "Y:/SmokingML V2/data/processed/ml_olympiad_train.csv", + "test": "Y:/SmokingML V2/data/processed/ml_olympiad_test.csv" + }, + "archive": { + "train": "Y:/SmokingML V2/data/processed/archive_train.csv", + "test": "Y:/SmokingML V2/data/processed/archive_test.csv" + } + } + + for dataset_name, paths in preprocessed_data_paths.items(): + for key, path in paths.items(): + os.makedirs(os.path.dirname(path), exist_ok=True) + + pd.concat([x_train_ml, y_train_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["train"], index=False) + pd.concat([x_val_ml, y_val_ml], axis=1).to_csv(preprocessed_data_paths["ml-olympiad-smoking"]["test"], index=False) + pd.concat([x_train_archive, y_train_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["train"], index=False) + pd.concat([x_val_archive, y_val_archive], axis=1).to_csv(preprocessed_data_paths["archive"]["test"], index=False) + + print("Preprocessed data has been saved successfully!") + + #* Variance Thresholding + preprocessed_train_ml, preprocessed_test_ml = remove_low_variance_features(pd.concat([x_train_ml, y_train_ml], axis=1), pd.concat([x_val_ml, y_val_ml], axis=1)) + preprocessed_train_archive, preprocessed_test_archive = remove_low_variance_features(pd.concat([x_train_archive, y_train_archive], axis=1), pd.concat([x_val_archive, y_val_archive], axis=1)) + + #* Feature Selection + preprocessed_train_ml, 
preprocessed_test_ml = select_features_by_mutual_info(preprocessed_train_ml, preprocessed_test_ml, target_column='smoking') + preprocessed_train_archive, preprocessed_test_archive = select_features_by_mutual_info(preprocessed_train_archive, preprocessed_test_archive, target_column='smoking') + + #* ✅ Optional assertion checks + assert 'smoking' in preprocessed_train_ml.columns, "Target column 'smoking' missing in training set!" + assert 'smoking' in preprocessed_test_ml.columns, "Target column 'smoking' missing in test set!" + assert 'smoking' in preprocessed_train_archive.columns, "Target column 'smoking' missing in archive training set!" + assert 'smoking' in preprocessed_test_archive.columns, "Target column 'smoking' missing in archive test set!" + + #* ✅ Debug: Show absolute save paths + print("\n✅ Saving preprocessed files to:") + print("ML Train Path :", os.path.abspath("Y:/SmokingML V2/data/processed/train_ml.csv")) + print("ML Test Path :", os.path.abspath("Y:/SmokingML V2/data/processed/test_ml.csv")) + print("Archive Train Path :", os.path.abspath("Y:/SmokingML V2/data/processed/train_archive.csv")) + print("Archive Test Path :", os.path.abspath("Y:/SmokingML V2/data/processed/test_archive.csv")) + + #* Save final preprocessed files + preprocessed_train_ml.to_csv("Y:/SmokingML V2/data/processed/train_ml.csv", index=False) + preprocessed_test_ml.to_csv("Y:/SmokingML V2/data/processed/test_ml.csv", index=False) + preprocessed_train_archive.to_csv("Y:/SmokingML V2/data/processed/train_archive.csv", index=False) + preprocessed_test_archive.to_csv("Y:/SmokingML V2/data/processed/test_archive.csv", index=False) + + print("Feature Engineering and Selection completed Successfully!") + + + import json + + #* Save selected features to JSON for both datasets + selected_features_dir = "Y:/SmokingML V2/artifacts/models" + os.makedirs(selected_features_dir, exist_ok=True) + + # Remove 'smoking' from selected columns before saving (optional based on use-case) + 
class FeatureEngineer:
    """Derive extra features from a DataFrame using configurable rules.

    Rules come from ``config/feature_engineering_rules.json`` (resolved
    against the current working directory) when that file exists, otherwise
    from the built-in defaults returned by ``_get_default_rules``.
    """

    def __init__(self):
        self.rules = self._load_rules()

    def _load_rules(self) -> Dict:
        """Load feature engineering rules from config file"""
        config_path = Path("config/feature_engineering_rules.json")
        if not config_path.exists():
            return self._get_default_rules()

        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _get_default_rules(self) -> Dict:
        """Default feature engineering rules if no config exists"""
        return {
            "health_indicators": [
                {
                    "name": "bmi_health_index",
                    "formula": "weight / (height ** 2)",
                    "enabled": True,
                    "description": "BMI-based health indicator"
                }
            ],
            "polynomial_features": ["age", "weight", "height"],
            "feature_ratios": [
                {
                    "name": "age_bmi_ratio",
                    "formula": "age / bmi_health_index",
                    "enabled": True
                }
            ],
            "polynomial_degree": 2
        }

    def _apply_formula_rules(self, df: pd.DataFrame, rules: List[Dict]) -> pd.DataFrame:
        """Evaluate each enabled rule's formula and store it as a new column.

        A formula sees every column of the frame built so far (so later rules
        can use earlier results) plus ``np``. A rule that fails to evaluate is
        reported on stdout and skipped; the remaining rules still run.
        """
        result = df.copy()

        for rule in rules:
            # A rule without an explicit flag is treated as enabled.
            if not rule.get("enabled", True):
                continue
            try:
                # NOTE(security): eval() with __builtins__ removed is NOT a
                # sandbox. Formulas must come from a trusted source or be
                # validated before they are written to the rules file.
                namespace = {**dict(result), "np": np}
                result[rule["name"]] = eval(rule["formula"],
                                            {"__builtins__": None},
                                            namespace)
            except Exception as e:
                print(f"Failed to calculate {rule.get('name', '<unnamed>')}: {str(e)}")

        return result

    def create_health_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate health indicator features based on configured rules"""
        return self._apply_formula_rules(df, self.rules.get("health_indicators", []))

    def create_polynomial_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate polynomial features for specified columns"""
        result = df.copy()
        features_to_transform = [col for col in self.rules.get("polynomial_features", [])
                                 if col in df.columns]

        if not features_to_transform:
            return result

        poly = PolynomialFeatures(
            degree=self.rules.get("polynomial_degree", 2),
            include_bias=False
        )

        poly_features = poly.fit_transform(df[features_to_transform])
        feature_names = poly.get_feature_names_out(features_to_transform)

        # With include_bias=False the first len(features_to_transform) outputs
        # are the untouched inputs; only the terms after them are new.
        first_new = len(features_to_transform)
        for i, name in enumerate(feature_names[first_new:], start=first_new):
            result[f"poly_{name}"] = poly_features[:, i]

        return result

    def create_feature_ratios(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate feature ratios based on configured rules"""
        return self._apply_formula_rules(df, self.rules.get("feature_ratios", []))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply all feature engineering transformations"""
        # Order matters: ratio formulas may reference health indicator columns.
        # Every step works on a copy, so the caller's frame is never mutated.
        result = self.create_health_indicators(df)
        result = self.create_polynomial_features(result)
        result = self.create_feature_ratios(result)
        return result
Prediction/model_deployment.py new file mode 100644 index 000000000..93ac94cff --- /dev/null +++ b/projects/prediction/Smoking Prediction/model_deployment.py @@ -0,0 +1,847 @@ + +#? STAGE 8: MODEL DEPLOYMENT + +from fastapi import FastAPI, HTTPException, Path, Body +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +import pandas as pd +import joblib +import os +import sys +import logging +from datetime import datetime +from dotenv import load_dotenv +from fastapi.openapi.utils import get_openapi +from sklearn.ensemble import VotingClassifier +from sklearn.preprocessing import PolynomialFeatures +from typing import Optional, List, Dict +from contextlib import asynccontextmanager +import socket +import uvicorn +from fastapi.openapi.docs import get_swagger_ui_html +from fastapi.responses import HTMLResponse +import json +from .feature_engineering import FeatureEngineer + +# Configure logging to both file and console with maximum verbosity +LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') +API_LOG_DIR = os.path.join(LOG_DIR, 'api') +DEPLOYMENT_LOG_DIR = os.path.join(LOG_DIR, 'deployment') +ERROR_LOG_DIR = os.path.join(LOG_DIR, 'errors') + +# Create log directories if they don't exist +os.makedirs(LOG_DIR, exist_ok=True) +os.makedirs(API_LOG_DIR, exist_ok=True) +os.makedirs(DEPLOYMENT_LOG_DIR, exist_ok=True) +os.makedirs(ERROR_LOG_DIR, exist_ok=True) + +# Configure logging with organized file structure +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(os.path.join(API_LOG_DIR, f'api_{datetime.now().strftime("%Y%m%d")}.log')), + logging.FileHandler(os.path.join(DEPLOYMENT_LOG_DIR, f'deployment_{datetime.now().strftime("%Y%m%d")}.log')), + logging.StreamHandler(sys.stdout) + ] +) + +# Configure error logging separately +error_handler = logging.FileHandler(os.path.join(ERROR_LOG_DIR, 
# Request/response schema for tunable per-model prediction settings.
# Every field is optional and falls back to the values in
# DEFAULT_MODEL_PARAMETERS.
class ModelParameters(BaseModel):
    # Minimum confidence required to report a positive prediction (0..1).
    confidence_threshold: Optional[float] = Field(0.5, ge=0.0, le=1.0)
    # Weight per class label; keys are the class labels as strings.
    class_weights: Optional[Dict[str, float]] = Field(
        default_factory=lambda: {"0": 1.0, "1": 1.0}
    )
    # {"indicator": {"low": x, "high": y}} ranges for the health status flags.
    # The factory builds a fresh copy on every call: handing out the
    # module-level dict itself would let a change to one instance's
    # thresholds silently rewrite the defaults for everyone else.
    health_indicator_thresholds: Optional[Dict[str, Dict[str, float]]] = Field(
        default_factory=lambda: {
            indicator: dict(bounds)
            for indicator, bounds in DEFAULT_MODEL_PARAMETERS["health_indicator_thresholds"].items()
        }
    )

    class Config:
        json_schema_extra = {
            "example": DEFAULT_MODEL_PARAMETERS
        }
# Define lifespan to load models and handle startup logging
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the deployable models on startup and release them on shutdown.

    Every ``.pkl`` file in MODEL_PATH whose base name is a key of BEST_MODELS
    is loaded with joblib, and its 'model' entry is stored in the module-level
    ``models`` dict.

    Raises:
        FileNotFoundError: MODEL_PATH is missing or holds no ``.pkl`` file.
        RuntimeError: none of the expected models could be found.
        Exception: whatever joblib raises for an unreadable model file.
    """
    try:
        logger.info(f"Starting model loading from {MODEL_PATH}")
        if not os.path.exists(MODEL_PATH):
            error_msg = f"Model directory not found at {MODEL_PATH}"
            logger.error(error_msg)
            raise FileNotFoundError(error_msg)

        model_files = [f for f in os.listdir(MODEL_PATH) if f.endswith('.pkl')]
        logger.info(f"Found model files: {model_files}")

        if not model_files:
            error_msg = f"No .pkl model files found in {MODEL_PATH}"
            logger.error(error_msg)
            raise FileNotFoundError(error_msg)

        for model_file in model_files:
            # splitext removes only the trailing extension; str.replace would
            # also strip '.pkl' occurring elsewhere in the file name.
            model_name, _ = os.path.splitext(model_file)
            if model_name not in BEST_MODELS:
                continue
            model_path = os.path.join(MODEL_PATH, model_file)
            try:
                logger.info(f"Loading model {model_name} from {model_path}")
                # NOTE(security): joblib.load unpickles the file; only load
                # model artifacts from a trusted location.
                model_artifacts = joblib.load(model_path)
                models[model_name] = model_artifacts['model']
                logger.info(f"Successfully loaded model: {model_name}")
            except Exception as e:
                logger.error(f"Error loading model {model_name}: {str(e)}", exc_info=True)
                raise

        if not models:
            error_msg = f"No best models found for deployment in {MODEL_PATH}. Expected models: {list(BEST_MODELS.keys())}"
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        # Startup logging
        logger.info("=== Server Starting ===")
        logger.info(get_ip())
        logger.info("You can access the API at:")
        logger.info("  http://127.0.0.1:8000")
        logger.info("  http://localhost:8000")
        logger.info("API documentation available at:")
        logger.info("  http://127.0.0.1:8000/docs")
        logger.info("  http://localhost:8000/docs")
        logger.info("Try both URLs if one doesn't work")

        # Only claim full success when every expected model is present.
        missing_models = [name for name in BEST_MODELS if name not in models]
        if missing_models:
            logger.warning(f"Expected models not loaded: {missing_models}")
        else:
            logger.info("All models loaded successfully. Ready to serve.")
    except Exception as e:
        logger.error(f"Error during startup: {str(e)}", exc_info=True)
        raise

    try:
        yield
    finally:
        # Runs even when the application exits with an error.
        logger.info("Cleaning up models")
        models.clear()
# Add socket info logging
def get_ip():
    """Describe this host for log and status output.

    Returns:
        str: ``"Hostname: <name>, Local IP: <address>"`` where the address is
        whatever the host name resolves to, or
        ``"Could not determine IP: <reason>"`` when the lookup fails. The
        function never raises for lookup failures.
    """
    try:
        # Resolve this machine's own host name to an address.
        hostname = socket.gethostname()
        local_ip = socket.gethostbyname(hostname)
    except (OSError, UnicodeError) as e:
        # OSError covers resolver failures (socket.gaierror / herror);
        # UnicodeError covers host names that cannot be IDNA-encoded.
        return f"Could not determine IP: {str(e)}"
    return f"Hostname: {hostname}, Local IP: {local_ip}"
# Define input schema
# Request body of the prediction endpoint. Fields whose dataset column name is
# not a valid Python identifier carry that column name as alias; the
# prediction endpoint dumps this model by alias to build its feature frame.
class SmokingPredictionInput(BaseModel):
    # --- required body measurements (alias = dataset column name) ---
    height_cm: float = Field(..., alias="height(cm)")
    weight_kg: float = Field(..., alias="weight(kg)")
    waist_cm: float = Field(..., alias="waist(cm)")
    # --- required values whose field name already equals the column name ---
    age: float
    ALT: float
    Gtp: float
    HDL: float
    # LDL and Cholesterol may be omitted from a request; they then default to 0.0
    LDL: float = Field(0.0)
    Cholesterol: float = Field(0.0)
    systolic: float
    relaxation: float
    hemoglobin: float
    serum_creatinine: float = Field(..., alias="serum creatinine")
    triglyceride: float
    # --- optional values: may be omitted (default applies) or sent as null ---
    AST: Optional[float] = Field(0.0)
    dental_caries: Optional[int] = Field(0, alias="dental caries")
    eyesight_right: Optional[float] = Field(0.0, alias="eyesight(right)")
    eyesight_left: Optional[float] = Field(0.0, alias="eyesight(left)")
    fasting_blood_sugar: Optional[float] = Field(0.0, alias="fasting blood sugar")

    class Config:
        # Accept both the Python field name and the alias in incoming payloads.
        populate_by_name = True
        # Sample payload (keyed by alias) attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "height(cm)": 170.0,
                "weight(kg)": 70.0,
                "waist(cm)": 85.0,
                "eyesight(left)": 1.0,
                "eyesight(right)": 1.0,
                "age": 35.0,
                "ALT": 25.0,
                "AST": 20.0,
                "Gtp": 30.0,
                "HDL": 50.0,
                "LDL": 100.0,
                "Cholesterol": 180.0,
                "dental caries": 0,
                "fasting blood sugar": 90.0,
                "relaxation": 80.0,
                "serum creatinine": 1.0,
                "triglyceride": 150.0,
                "hemoglobin": 15.0,
                "systolic": 120.0
            }
        }
def _range_flag(values, bounds):
    """Return 1 where ``bounds['low'] <= values <= bounds['high']``, else 0."""
    return ((values >= bounds['low']) & (values <= bounds['high'])).astype(int)


def _sanitize_measurements(data):
    """Replace null measurements in the single-row frame with neutral numbers."""
    for col in ('systolic', 'triglyceride', 'HDL', 'LDL', 'AST', 'ALT', 'Gtp',
                'fasting blood sugar'):
        fallback = 1.0 if col == 'HDL' else 0.0
        column = data.get(col)
        value = fallback if column is None else column.iloc[0]
        if pd.isna(value):
            value = fallback
        elif col == 'HDL':
            # HDL is used as a divisor below, so keep it at 1 or above.
            value = max(value, 1.0)
        data[col] = value


def _add_health_features(data, model_name, health_thresholds):
    """Add the derived indicator, status, ratio and polynomial columns in place."""
    # Basic health indicators
    data['bmi'] = data['weight(kg)'] / ((data['height(cm)'] / 100) ** 2)
    data['liver_function'] = (data['AST'] + data['ALT'] + data['Gtp']) / 3
    data['cardiovascular_risk'] = (data['systolic'] * data['triglyceride']) / data['HDL']
    data['metabolic_index'] = (data['fasting blood sugar'] * data['bmi']) / data['HDL']

    # Status flags: 1 when the indicator lies inside its configured range
    data['bmi_status'] = _range_flag(data['bmi'], health_thresholds['bmi'])
    data['liver_status'] = _range_flag(data['liver_function'], health_thresholds['liver_function'])
    data['cv_risk_status'] = _range_flag(data['cardiovascular_risk'], health_thresholds['cardiovascular_risk'])
    data['metabolic_status'] = _range_flag(data['metabolic_index'], health_thresholds['metabolic_index'])

    # Additional ratios (+1 in the denominator avoids division by zero)
    data['hdl_ldl_ratio'] = data['HDL'] / (data['LDL'] + 1)
    data['ast_alt_ratio'] = data['AST'] / (data['ALT'] + 1)
    data['bp_ratio'] = data['systolic'] / (data['relaxation'] + 1)

    # Degree-2 polynomial terms; the inputs and the kept terms depend on the model
    if model_name == 'ml_olympiad_improved_final':
        key_features = ['bmi', 'liver_function', 'cardiovascular_risk', 'metabolic_index']
        kept_terms = None  # keep every generated term
    else:  # archive_improved_final
        key_features = ['bmi', 'liver_function', 'cardiovascular_risk']
        kept_terms = (0, 4, 5)

    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(data[key_features])
    term_indices = range(poly_features.shape[1]) if kept_terms is None else kept_terms
    for i in term_indices:
        data[f'health_poly_{i}'] = poly_features[:, i]


# Prediction endpoint
@app.post("/predict/{model_name}", tags=["Predictions"])
async def predict(
    model_name: str = Path(
        ...,
        description="Available models: ml_olympiad_improved_final, archive_improved_final"
    ),
    input_data: SmokingPredictionInput = Body(...)
):
    """Predict smoking status for one set of measurements with the chosen model."""
    logger.info(f"Prediction requested for model: {model_name}")
    try:
        # Clean up model name
        model_name = model_name.strip()

        if model_name not in models:
            error_msg = f"Model '{model_name}' not found. Available models: {list(models.keys())}"
            logger.error(error_msg)
            raise HTTPException(status_code=404, detail={"error": error_msg})

        # Get model parameters or use defaults
        model_params = model_parameters.get(model_name, DEFAULT_MODEL_PARAMETERS)
        confidence_threshold = model_params["confidence_threshold"]
        health_thresholds = model_params["health_indicator_thresholds"]

        # Convert input data to DataFrame (keys are the dataset column names)
        input_dict = input_data.dict(by_alias=True)
        logger.debug(f"Raw input data: {input_dict}")

        # A non-positive height makes the BMI infinite or undefined, which
        # used to surface later as an opaque server error. Reject it here.
        if not input_dict["height(cm)"] > 0:
            error_msg = "height(cm) must be greater than zero"
            logger.error(error_msg)
            raise HTTPException(status_code=400, detail={"error": error_msg})

        data = pd.DataFrame([input_dict])

        try:
            _sanitize_measurements(data)
            logger.debug("Initialized features with safe values")

            _add_health_features(data, model_name, health_thresholds)
            logger.debug("All features calculated successfully")
            logger.debug(f"Available features: {list(data.columns)}")

        except Exception as e:
            error_msg = f"Error calculating health indicators: {str(e)}"
            logger.error(error_msg)
            logger.error(f"Data state: {data.to_dict()}")
            raise HTTPException(status_code=400, detail={"error": error_msg})

        # Select features based on model type
        if model_name == 'ml_olympiad_improved_final':
            required_features = [
                "age", "height(cm)", "weight(kg)", "systolic", "relaxation",
                "Cholesterol", "triglyceride", "HDL", "LDL", "hemoglobin",
                "serum creatinine", "AST", "ALT", "Gtp", "dental caries",
                "health_poly_0", "health_poly_1", "health_poly_4", "health_poly_13",
                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
            ]
        else:  # archive_improved_final
            required_features = [
                "age", "height(cm)", "weight(kg)", "waist(cm)", "systolic",
                "relaxation", "fasting blood sugar", "triglyceride", "HDL",
                "LDL", "hemoglobin", "serum creatinine", "ALT", "Gtp",
                "dental caries", "health_poly_0", "health_poly_4", "health_poly_5",
                "bmi", "liver_function", "hdl_ldl_ratio", "ast_alt_ratio"
            ]

        # Build the model input with only the required features, in this order
        for feature in required_features:
            if feature not in data.columns:
                error_msg = f"Missing required feature: {feature}"
                logger.error(error_msg)
                raise HTTPException(status_code=400, detail={"error": error_msg})
        prediction_data = data[required_features].copy()

        logger.debug(f"Final features for prediction: {list(prediction_data.columns)}")

        # Make prediction
        model = models[model_name]
        prediction = model.predict(prediction_data)[0]
        probabilities = model.predict_proba(prediction_data)[0]
        confidence = float(max(probabilities))

        # Report "Smoker" only when the model says so with enough confidence
        adjusted_prediction = 1 if confidence >= confidence_threshold and prediction == 1 else 0

        result = {
            "model_used": BEST_MODELS[model_name],
            "prediction": int(adjusted_prediction),
            "label": "Smoker" if adjusted_prediction == 1 else "Non-smoker",
            "confidence": f"{confidence:.2%}",
            "confidence_threshold": confidence_threshold,
            # Reuse the flags computed during feature engineering instead of
            # repeating the range comparisons.
            "health_indicators": {
                "bmi_status": bool(data['bmi_status'].iloc[0]),
                "liver_status": bool(data['liver_status'].iloc[0]),
                "cardiovascular_status": bool(data['cv_risk_status'].iloc[0]),
                "metabolic_status": bool(data['metabolic_status'].iloc[0])
            },
            "calculated_features": {
                "bmi": float(data['bmi'].iloc[0]),
                "liver_function": float(data['liver_function'].iloc[0]),
                "cardiovascular_risk": float(data['cardiovascular_risk'].iloc[0]),
                "metabolic_index": float(data['metabolic_index'].iloc[0])
            },
            "model_type": "XGBoost" if model_name == "ml_olympiad_improved_final" else "Ensemble",
            "features_used": required_features
        }

        return result

    except HTTPException:
        # Already carries the intended status code; do not wrap it in a 500.
        raise
    except Exception as e:
        error_msg = f"Error making prediction: {str(e)}"
        logger.error(error_msg)
        logger.error("Full traceback: ", exc_info=True)
        raise HTTPException(status_code=500, detail={"error": error_msg})
def _formula_problem(formula):
    """Return why ``formula`` is rejected, or None when it is acceptable.

    Accepted formulas are single expressions built from column names, numeric
    constants, arithmetic/comparison/boolean operators and calls of the form
    ``np.<public name>(...)``. This is a defence-in-depth filter for text that
    is later passed to ``eval``; it is not a complete sandbox.
    """
    import ast

    allowed_nodes = (
        ast.Expression, ast.BinOp, ast.UnaryOp, ast.Compare, ast.BoolOp,
        ast.IfExp, ast.operator, ast.unaryop, ast.cmpop, ast.boolop,
        ast.Name, ast.Load, ast.Constant, ast.Call, ast.keyword, ast.Attribute,
    )

    try:
        tree = ast.parse(formula, mode="eval")
    except SyntaxError as exc:
        return f"not a valid expression ({exc.msg})"

    for node in ast.walk(tree):
        if not isinstance(node, allowed_nodes):
            return f"unsupported syntax: {type(node).__name__}"
        if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float)):
            return "only numeric constants are allowed"
        if isinstance(node, ast.Name) and node.id.startswith("_"):
            return f"name '{node.id}' is not allowed"
        if isinstance(node, ast.Attribute):
            on_numpy = isinstance(node.value, ast.Name) and node.value.id == "np"
            if not on_numpy or node.attr.startswith("_"):
                return "attribute access is only allowed as np.<public name>"
        if isinstance(node, ast.Call) and not isinstance(node.func, ast.Attribute):
            return "only np.<function>(...) calls are allowed"
    return None


@app.put("/feature-engineering/rules", tags=["Feature Engineering"])
async def update_feature_engineering_rules(rules: FeatureEngineeringRules):
    """
    Update feature engineering rules including:
    - Health indicator calculations
    - Polynomial feature generation rules
    - Feature ratio calculations

    Formulas that are not plain arithmetic expressions are rejected.
    """
    # The stored formulas are later evaluated as Python expressions by the
    # feature engineering component, so reject anything beyond arithmetic
    # BEFORE it is persisted.
    formula_sections = (
        ("health_indicators", rules.health_indicators),
        ("feature_ratios", rules.feature_ratios),
    )
    for section_name, section_rules in formula_sections:
        for rule in section_rules:
            problem = _formula_problem(rule.formula)
            if problem is not None:
                raise HTTPException(
                    status_code=400,
                    detail=f"Rejected formula for {section_name} rule '{rule.name}': {problem}"
                )

    try:
        # Save the rules to a configuration file
        rules_dict = rules.dict()
        os.makedirs("config", exist_ok=True)
        with open("config/feature_engineering_rules.json", "w", encoding="utf-8") as f:
            json.dump(rules_dict, f, indent=4)

        return {
            "status": "success",
            "message": "Feature engineering rules updated successfully",
            "rules": rules_dict
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to update feature engineering rules: {str(e)}"
        )
+ + +