diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..4c7949f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,87 @@ +# Git +.git +.gitignore +.github + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +pip-log.txt +pip-delete-this-directory.txt +.pytest_cache/ +.coverage +htmlcov/ + +# Node.js +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnp +.pnp.js +frontend/build +frontend/.env.local +frontend/.env.development.local +frontend/.env.test.local +frontend/.env.production.local + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Environment +.env +.env.local +.env.*.local + +# Uploads (will be handled by volumes) +backend/uploads/* +!backend/uploads/.gitkeep + +# Database +*.db +*.sqlite +*.sqlite3 + +# Logs +*.log +logs/ + +# Documentation +docs/assets/ +*.md +!README.md + +# Testing +.pytest_cache/ +coverage/ +.coverage + +# OS +Thumbs.db +.DS_Store diff --git a/Dockerfile.txt b/Dockerfile.txt index 3229b35..e69c806 100644 --- a/Dockerfile.txt +++ b/Dockerfile.txt @@ -1,18 +1,70 @@ -# Use official Python base image -FROM python:3.11-slim +# ML Simulator - Multi-stage Dockerfile +# Author: Akshit +# Date: October 13, 2025 +# Purpose: Containerize the ML Simulator application + +# ================================ +# Stage 1: Build Frontend +# ================================ +FROM node:18-alpine AS frontend-build # Set working directory +WORKDIR /app/frontend + +# Copy frontend package files +COPY frontend/package*.json ./ + +# Install dependencies +RUN npm ci --only=production + +# Copy frontend source code +COPY frontend/ ./ + +# Build the React app +RUN npm run build + +# ================================ +# Stage 2: Backend + Frontend +# ================================ +FROM python:3.9-slim + +# Set environment variables +ENV 
PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Create app directory WORKDIR /app -# Copy requirements file and install dependencies +# Copy backend requirements and install dependencies COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# Copy the rest of the app files -COPY . . +# Copy backend code +COPY backend/ ./backend/ + +# Copy built frontend from previous stage +COPY --from=frontend-build /app/frontend/build ./frontend/build + +# Create uploads directory +RUN mkdir -p backend/uploads/datasets backend/uploads/resumes + +# Expose port +EXPOSE 5000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:5000/health || exit 1 -# Expose a port if your app runs a server (optional) -# EXPOSE 5000 +# Set working directory to backend +WORKDIR /app/backend -# Default command to run the simulator -CMD ["python", "main.py"] \ No newline at end of file +# Run the application +CMD ["python", "app.py"] diff --git a/Docs/Knn.md b/Docs/Knn.md new file mode 100644 index 0000000..f186a3e --- /dev/null +++ b/Docs/Knn.md @@ -0,0 +1,53 @@ +# K-Nearest Neighbors (KNN) - Documentation + +## ๐Ÿ“‹ Overview + +KNN is a simple, instance-based learning algorithm that classifies data points based on the classes of their k nearest neighbors[web:100][web:102]. 
+ +**Key Characteristics:** +- **Type**: Instance-based Learning +- **Algorithm**: Distance-based classification +- **Output**: Class based on neighbor voting +- **Best For**: Small to medium datasets, pattern recognition + +## ๐ŸŽฏ Purpose and Use Cases + +- **Recommendation Systems**: Similar user preferences +- **Pattern Recognition**: Handwriting, image recognition +- **Anomaly Detection**: Identifying outliers +- **Medical Diagnosis**: Similar patient cases +- **Text Classification**: Document similarity + +## ๐Ÿ“Š Key Parameters + +| Parameter | Description | Default | Recommendation | +|-----------|-------------|---------|----------------| +| **n_neighbors (k)** | Number of neighbors | 5 | 3-15 (odd numbers) | +| **weights** | Vote weighting | uniform | uniform/distance | +| **metric** | Distance measure | euclidean | euclidean/manhattan | + +## ๐Ÿ’ก Choosing K Value + +- **Small k (3-5)**: More sensitive to noise, complex boundaries +- **Large k (10-20)**: Smoother boundaries, may miss patterns +- **Rule of thumb**: โˆšn where n = number of samples +- **Use odd k**: Avoids tie votes in binary classification + +## ๐Ÿ› Common Issues + +### Slow Prediction +- Reduce training data size +- Use approximate methods +- Try other algorithms for large datasets + +### Poor Performance +- Scale features (very important for KNN!) +- Try different k values +- Check for irrelevant features + +--- + +**Last Updated**: October 13, 2025 +**Version**: 1.0 +**Author**: Akshit +**Hacktoberfest 2025 Contribution** ๐ŸŽƒ diff --git a/Docs/Readme.md b/Docs/Readme.md new file mode 100644 index 0000000..812566d --- /dev/null +++ b/Docs/Readme.md @@ -0,0 +1,44 @@ +# ML Simulator - Model Documentation + +Welcome to the ML Simulator documentation! This directory contains comprehensive guides for each machine learning model available in the simulator. 
+ +## ๐Ÿ“š Available Models + +| Model | Type | Documentation | Use Case | +|-------|------|---------------|----------| +| [Logistic Regression](logistic_regression.md) | Classification | Binary classification | Disease prediction, spam detection | +| [Linear Regression](linear_regression.md) | Regression | Continuous prediction | Price prediction, trend analysis | +| [Decision Tree](decision_tree.md) | Classification/Regression | Tree-based decisions | Credit scoring, diagnosis | +| [Random Forest](random_forest.md) | Ensemble | Multiple trees | Complex classification tasks | +| [K-Nearest Neighbors](knn.md) | Classification/Regression | Instance-based | Pattern recognition | +| [Support Vector Machine](svm.md) | Classification | Maximum margin | Text classification, image recognition | + +## ๐Ÿš€ Quick Start + +Each model documentation includes: +- โœ… **Overview**: What the model does and when to use it +- โœ… **How to Run**: Step-by-step instructions +- โœ… **Parameter Explanations**: What each setting means +- โœ… **Plot Interpretations**: Understanding visualizations +- โœ… **Performance Metrics**: Evaluating model quality +- โœ… **Troubleshooting**: Common issues and solutions +- โœ… **Examples**: Real-world use cases + +## ๐Ÿ“– How to Use This Documentation + +1. Select the model you want to learn about from the table above +2. Click on the documentation link +3. Follow the step-by-step guide +4. Review the screenshot examples +5. Apply to your own dataset + +## ๐ŸŽฏ Contributing + +Found an error or want to improve the documentation? See our [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines. 
+ +--- + +**Last Updated**: October 13, 2025 +**Version**: 1.0 +**Author**: Akshit +**Hacktoberfest 2025 Contribution** ๐ŸŽƒ diff --git a/Docs/decision_tree.md b/Docs/decision_tree.md new file mode 100644 index 0000000..1a3629e --- /dev/null +++ b/Docs/decision_tree.md @@ -0,0 +1,184 @@ +# Decision Tree - Documentation + +## ๐Ÿ“‹ Overview + +Decision Tree is a supervised learning algorithm that creates a tree-like model of decisions. It splits data based on feature values to make predictions for both classification and regression tasks[web:100][web:102]. + +**Key Characteristics:** +- **Type**: Supervised Learning - Classification or Regression +- **Output**: Class label or continuous value +- **Algorithm**: Recursive splitting based on information gain +- **Best For**: Non-linear relationships, interpretable models + +## ๐ŸŽฏ Purpose and Use Cases + +### Primary Use +Creating interpretable models that make decisions through a series of yes/no questions. + +### Common Applications +- **Medical Diagnosis**: Decision pathways for treatment +- **Credit Approval**: Loan decision logic +- **Customer Segmentation**: Marketing strategy decisions +- **Fraud Detection**: Rule-based fraud identification +- **Product Recommendations**: Decision logic for suggestions + +## ๐Ÿš€ How to Run + +### Step 1: Access the Model +1. Navigate to ML Simulator +2. Select **"Decision Tree"** from sidebar + +### Step 2: Choose Data Source +- Upload CSV or use sample dataset +- For classification: binary or multi-class target +- For regression: continuous target + +### Step 3: Configure Parameters + +| Parameter | Description | Default | Range | +|-----------|-------------|---------|-------| +| **Max Depth** | Maximum tree depth | 5 | 1-20 | +| **Min Samples Split** | Minimum samples to split | 2 | 2-20 | +| **Min Samples Leaf** | Minimum samples in leaf | 1 | 1-10 | +| **Criterion** | Splitting metric | gini/mse | gini/entropy | + +### Step 4: Train and Visualize +1. 
Configure parameters +2. Click **Train Model** +3. View tree structure and results + +## ๐Ÿ“Š What Each Plot Shows + +### 1. Tree Visualization + +**What You See:** +Visual representation of the decision tree structure. + +**Components:** +- **Root node**: Top of tree (all data) +- **Internal nodes**: Decision points +- **Leaf nodes**: Final predictions +- **Branches**: Decision paths + +**How to Read:** +- Each node shows: + - Feature and threshold used for split + - Number of samples + - Class distribution or value +- Follow branches from top to bottom +- Leaf nodes contain predictions + +### 2. Feature Importance + +**What You See:** +Bar chart showing which features are most important[web:99][web:101]. + +**Interpretation:** +- Longer bars: More important for decisions +- Features at top of tree: Usually most important +- Zero importance: Feature not used + +### 3. Confusion Matrix (Classification) + +**Same as Logistic Regression** +Shows prediction accuracy breakdown. + +### 4. Performance Metrics + +**Classification:** +- Accuracy, Precision, Recall, F1-Score + +**Regression:** +- Rยฒ, MSE, RMSE, MAE + +## ๐Ÿ”ง Model Parameters Explained + +### max_depth +**Purpose**: Limit tree depth to prevent overfitting +**Lower values**: Simpler, more general model +**Higher values**: More complex, may overfit +**Recommendation**: Start with 3-7 + +### min_samples_split +**Purpose**: Minimum samples required to split a node +**Lower values**: More splits, complex tree +**Higher values**: Fewer splits, simpler tree +**Recommendation**: 2-10 depending on data size + +### min_samples_leaf +**Purpose**: Minimum samples required in leaf node +**Effect**: Smooths model, prevents overfitting +**Recommendation**: 1-5 + +### criterion +**Classification:** +- **gini**: Gini impurity (default, faster) +- **entropy**: Information gain (more precise) + +**Regression:** +- **mse**: Mean squared error (default) +- **mae**: Mean absolute error (robust to outliers) + +## ๐Ÿ’ก Tips and 
Best Practices + +### Advantages +โœ… Easy to understand and interpret +โœ… Handles non-linear relationships +โœ… No feature scaling required +โœ… Handles mixed data types +โœ… Provides feature importance + +### Limitations +โŒ Prone to overfitting +โŒ Unstable (small data changes affect tree) +โŒ Biased toward dominant classes +โŒ Not optimal for linear relationships + +### Best Practices +- **Start shallow**: Begin with max_depth=3-5 +- **Prune the tree**: Use min_samples parameters +- **Cross-validate**: Check performance on multiple splits +- **Ensemble methods**: Consider Random Forest for better stability +- **Visualize tree**: Understand decision logic + +## ๐Ÿ› Troubleshooting + +### Issue: Perfect Training Accuracy, Poor Test Accuracy + +**Diagnosis:** Severe overfitting + +**Solutions:** +1. Reduce max_depth (try 3-7) +2. Increase min_samples_split (try 10-20) +3. Increase min_samples_leaf (try 5-10) +4. Use Random Forest instead + +### Issue: Tree Too Large to Visualize + +**Solutions:** +1. Reduce max_depth +2. Export tree to graphical format +3. Focus on top levels only + +### Issue: Low Accuracy + +**Solutions:** +1. Increase max_depth (try up to 15) +2. Check feature quality +3. Add more relevant features +4. 
Try ensemble methods + +## ๐Ÿ“š Additional Resources + +- [Scikit-learn Decision Trees](https://scikit-learn.org/stable/modules/tree.html) +- [Understanding Decision Trees](https://developers.google.com/machine-learning/decision-forests/decision-trees) +- [Tree Visualization Guide](https://mljar.com/blog/visualize-decision-tree/) + +## ๐ŸŽฏ Example Use Case + +### Scenario: Loan Approval System + +**Features:** +- income, credit_score, debt_ratio, employment_years + +**Tree might learn:** diff --git a/Docs/linear_regression.md b/Docs/linear_regression.md new file mode 100644 index 0000000..2672a11 --- /dev/null +++ b/Docs/linear_regression.md @@ -0,0 +1,207 @@ +# [Model Name] - Documentation + +## ๐Ÿ“‹ Overview + +Brief description of what this model does and its use cases. + +## ๐ŸŽฏ Purpose and Use Cases + +- **Primary Use**: [e.g., Binary classification, regression, clustering] +- **Common Applications**: + - Use case 1 + - Use case 2 + - Use case 3 + +## ๐Ÿš€ How to Run + +### Step 1: Access the Model +Navigate to the [Model Name] page in the ML Simulator application. + +### Step 2: Data Input +Choose one of the following options: +- **Upload CSV**: Upload your own dataset in CSV format +- **Use Sample Dataset**: Use the built-in sample dataset + +### Step 3: Configure Parameters + +| Parameter | Description | Default Value | Range/Options | +|-----------|-------------|---------------|---------------| +| Test Size | Percentage of data for testing | 20% | 10-50% | +| Feature Selection | Choose features for training | First 5 | All available | +| [Other params] | Description | Default | Options | + +### Step 4: Train the Model +Click the **Train Model** button to start training. 
+ +## ๐Ÿ“Š What Each Plot Shows + +### Training Results Dashboard +- **Accuracy Metric**: Shows the percentage of correct predictions +- **Training Samples**: Number of samples used for training +- **Test Samples**: Number of samples used for testing +- **Features Used**: Number of features selected for the model + +**Screenshot**: [Include screenshot here] + +**Interpretation**: Higher accuracy indicates better model performance. Aim for >80% for good results. + +--- + +### Predictions Table +- **Actual**: The true label from the dataset +- **Predicted**: The label predicted by the model +- **Probability**: Confidence score of the prediction (0-1) + +**Screenshot**: [Include screenshot here] + +**How to Read**: +- Probability close to 1 = high confidence in positive class +- Probability close to 0 = high confidence in negative class +- Probability around 0.5 = model is uncertain + +--- + +### Confusion Matrix +A heatmap showing the model's prediction accuracy across classes. + +**Screenshot**: [Include screenshot here] + +**Components**: +- **True Positives (TP)**: Correctly predicted positive cases +- **True Negatives (TN)**: Correctly predicted negative cases +- **False Positives (FP)**: Incorrectly predicted as positive +- **False Negatives (FN)**: Incorrectly predicted as negative + +**Interpretation**: +- Diagonal elements (TP, TN) should be high +- Off-diagonal elements (FP, FN) should be low + +--- + +### ROC Curve +Shows the trade-off between True Positive Rate and False Positive Rate. + +**Screenshot**: [Include screenshot here] + +**Components**: +- **Blue Line**: Your model's performance +- **Red Dashed Line**: Random classifier baseline +- **AUC Score**: Area Under the Curve (0-1) + +**Interpretation**: +- AUC = 1.0: Perfect classifier +- AUC > 0.8: Excellent model +- AUC > 0.7: Good model +- AUC = 0.5: No better than random guessing + +--- + +### Feature Importance +Bar chart showing which features have the most impact on predictions. 
+ +**Screenshot**: [Include screenshot here] + +**How to Read**: +- Longer bars = more important features +- Positive values = increases probability of positive class +- Negative values = decreases probability of positive class + +## ๐Ÿ”ง Model Parameters Explained + +### Algorithm-Specific Parameters + +| Parameter | Description | When to Adjust | +|-----------|-------------|----------------| +| max_iter | Maximum iterations for training | Increase if model doesn't converge | +| C (regularization) | Controls model complexity | Lower for simpler models | +| solver | Optimization algorithm | Change based on dataset size | + +## ๐Ÿ“ˆ Performance Metrics + +### Accuracy +Percentage of correct predictions out of total predictions. +- **Formula**: (TP + TN) / (TP + TN + FP + FN) +- **Good Range**: >70% + +### Precision +Of all positive predictions, how many were correct? +- **Formula**: TP / (TP + FP) +- **Use When**: False positives are costly + +### Recall (Sensitivity) +Of all actual positives, how many did we catch? +- **Formula**: TP / (TP + FN) +- **Use When**: False negatives are costly + +### F1-Score +Harmonic mean of precision and recall. 
+- **Formula**: 2 ร— (Precision ร— Recall) / (Precision + Recall) +- **Use When**: Need balance between precision and recall + +## ๐Ÿ’ก Tips and Best Practices + +### Data Preparation +- โœ… Ensure your CSV has a clear binary target column (0/1) +- โœ… Remove or handle missing values before upload +- โœ… Normalize features if they have different scales +- โŒ Avoid datasets with too few samples (<100) + +### Feature Selection +- Select features that are relevant to your prediction task +- Avoid highly correlated features (redundant information) +- Start with 3-10 features for interpretability + +### Model Tuning +- Adjust test size based on dataset size (smaller datasets need smaller test size) +- If accuracy is low, try selecting different features +- Check for class imbalance in your target variable + +## ๐Ÿ› Troubleshooting + +### Issue: Low Accuracy (<60%) +**Solutions**: +- Check if features are relevant to the target +- Try different feature combinations +- Ensure data quality (no missing/corrupted values) +- Check for class imbalance + +### Issue: Model Takes Too Long to Train +**Solutions**: +- Reduce number of features +- Use smaller dataset for testing +- Check your data for unnecessary large values + +### Issue: Upload Error +**Solutions**: +- Ensure CSV format is correct +- Check for special characters in column names +- Verify file size is reasonable (<10MB) + +## ๐Ÿ“š Additional Resources + +- [Scikit-learn Documentation](https://scikit-learn.org/) +- [Understanding Logistic Regression](https://link-to-resource) +- [ROC Curves Explained](https://link-to-resource) + +## ๐ŸŽฏ Example Use Case + +**Scenario**: Predicting customer churn + +1. Upload customer data CSV with features like age, tenure, monthly charges +2. Select target column: 'churn' (0 = stayed, 1 = left) +3. Choose relevant features: tenure, monthly_charges, total_charges +4. Set test size to 20% +5. Train model and analyze results +6. 
Use confusion matrix to understand prediction errors +7. Check ROC curve to ensure AUC > 0.7 + +**Expected Results**: +- Accuracy: 75-85% +- AUC: 0.8-0.9 +- High precision on predicting churners + +--- + +**Last Updated**: October 2025 +**Version**: 1.0 +**Maintainer**: [Akshit] diff --git a/Docs/logistic_regression.md b/Docs/logistic_regression.md new file mode 100644 index 0000000..8e19d4e --- /dev/null +++ b/Docs/logistic_regression.md @@ -0,0 +1,78 @@ +# Logistic Regression - Documentation + +## ๐Ÿ“‹ Overview + +Logistic Regression is a statistical method for binary classification that predicts the probability of an outcome belonging to one of two classes (0 or 1). Despite its name, it's a classification algorithm, not a regression algorithm[web:102][web:103]. + +**Key Characteristics:** +- **Type**: Supervised Learning - Binary Classification +- **Output**: Probability score between 0 and 1 +- **Algorithm**: Uses sigmoid function to map predictions to probabilities +- **Best For**: Linearly separable data with binary outcomes + +## ๐ŸŽฏ Purpose and Use Cases + +### Primary Use +Binary classification problems where you need to predict one of two possible outcomes. + +### Common Applications +- **Medical Diagnosis**: Disease prediction (positive/negative) +- **Spam Detection**: Email classification (spam/not spam) +- **Customer Churn**: Will customer leave? (yes/no) +- **Credit Scoring**: Loan approval (approve/reject) +- **Marketing**: Click prediction (will click/won't click) + +## ๐Ÿš€ How to Run + +### Step 1: Access the Model +1. Navigate to the ML Simulator application +2. Open the sidebar menu +3. 
Select **"Logistic Regression"** from the available models + +### Step 2: Choose Data Source +You have two options for providing data: + +**Option A: Upload CSV File** +- Click "Upload CSV" in the sidebar +- Select your CSV file (must contain binary target column with 0/1 values) +- Ensure your data has: + - At least 100 rows + - Numerical features + - A binary target column (0 or 1) + +**Option B: Use Sample Dataset** +- Select "Use Sample Dataset" radio button +- The Breast Cancer dataset will be loaded automatically +- Contains 569 samples with 30 features + +### Step 3: Configure Parameters + +| Parameter | Description | Default Value | Recommended Range | +|-----------|-------------|---------------|-------------------| +| **Target Column** | Column to predict (must be 0/1) | First binary column | Any binary column | +| **Test Size** | Percentage of data for testing | 20% | 10-30% | +| **Feature Selection** | Choose features for training | First 5 features | 3-10 features | +| **max_iter** | Maximum training iterations | 1000 | 500-2000 | + +### Step 4: Train the Model +1. Select your target column from the dropdown +2. Choose features you want to use for prediction +3. Adjust test size slider if needed +4. Click the **๐Ÿš€ Train Model** button +5. Wait for training to complete (usually 1-5 seconds) + +## ๐Ÿ“Š What Each Plot Shows + +### 1. Training Results Dashboard + +**What You See:** +Four gradient-colored metric cards displaying key performance indicators[web:99][web:102]. 
+ +**Components:** +- **Accuracy**: Overall percentage of correct predictions +- **Training Samples**: Number of data points used for training +- **Test Samples**: Number of data points used for testing +- **Features Used**: Number of features selected for the model + +**How to Interpret:** +- diff --git a/Docs/plot_helper.md b/Docs/plot_helper.md new file mode 100644 index 0000000..c5831e2 --- /dev/null +++ b/Docs/plot_helper.md @@ -0,0 +1,167 @@ +# Plot Helpers Documentation + +## Overview + +The `plot_helpers.py` module provides utility functions for creating matplotlib/seaborn visualizations optimized for Streamlit display. + +## Functions + +### `plot_roc_curve(y_true, y_pred_proba, title="ROC Curve", return_fig=True)` + +Creates an ROC curve plot with AUC score. + +**Parameters:** +- `y_true`: True binary labels (0 or 1) +- `y_pred_proba`: Predicted probabilities for positive class +- `title`: Plot title (optional) +- `return_fig`: Return figure object or base64 string + +**Returns:** matplotlib Figure object + +**Example:** + from utils.plot_helpers import plot_roc_curve +import streamlit as st + +fig = plot_roc_curve(y_test, y_pred_proba) +st.pyplot(fig) + +text + +### `plot_confusion_matrix(y_true, y_pred, labels=None, title="Confusion Matrix")` + +Creates a confusion matrix heatmap. + +**Parameters:** +- `y_true`: True labels +- `y_pred`: Predicted labels +- `labels`: Class labels (optional) +- `title`: Plot title (optional) + +**Returns:** matplotlib Figure object + +### `plot_feature_importance(feature_names, importance_scores, top_n=10, title="Feature Importance")` + +Creates a horizontal bar chart of feature importance. 
+ +**Parameters:** +- `feature_names`: List of feature names +- `importance_scores`: Importance scores +- `top_n`: Number of top features to display +- `title`: Plot title (optional) + +**Returns:** matplotlib Figure object + +### `plot_prediction_distribution(y_pred_proba, y_true, title="Prediction Probability Distribution")` + +Creates histogram of prediction probabilities by true class. + +### `plot_residuals(y_true, y_pred, title="Residual Plot")` + +Creates residual plots for regression models. + +### `plot_actual_vs_predicted(y_true, y_pred, title="Actual vs Predicted")` + +Creates scatter plot of actual vs predicted values. + +## Styling + +All plots use: +- Seaborn whitegrid style +- Consistent color scheme +- Bold labels and titles +- Grid for better readability +- High DPI for quality + +## Author + +Akshit - Hacktoberfest 2025 +PR Comment for Issue #5 +text +## ๐Ÿ“Š ROC Curve Helper Function Complete! + +Hi maintainers! ๐Ÿ‘‹ + +I've successfully implemented a comprehensive **plot helper utility** for creating matplotlib/seaborn visualizations as requested in **issue #5**. + +### ๐Ÿ“ Files Added/Modified: + +โœ… `utils/plot_helpers.py` - Complete plotting utility module +โœ… `docs/plot_helpers.md` - Comprehensive documentation +โœ… Updated `requirements.txt` with dependencies +โœ… Example integration code for Streamlit pages + +### โœจ Features Implemented: + +**1. ROC Curve Function:** +- Accepts true labels and predicted probabilities +- Calculates and plots ROC curve with AUC score +- Includes diagonal reference line (random classifier) +- Fills area under curve for better visualization +- Adds AUC score annotation box +- Returns matplotlib figure for Streamlit display + +**2. 
Additional Helper Functions:** +- `plot_confusion_matrix()` - Heatmap visualization +- `plot_feature_importance()` - Horizontal bar chart +- `plot_prediction_distribution()` - Probability histograms +- `plot_residuals()` - For regression models +- `plot_actual_vs_predicted()` - Regression scatter plot + +**3. Professional Styling:** +- Consistent color scheme +- Bold labels and titles +- Grid for readability +- High-quality DPI settings +- Seaborn whitegrid style + +### ๐ŸŽฏ Addresses Issue Requirements: + +โœ… Added `plot_roc_curve()` function in `utils/plot_helpers.py` +โœ… Function accepts true labels and predicted scores +โœ… Returns matplotlib/seaborn figure for Streamlit +โœ… Includes comprehensive documentation +โœ… Easy to integrate with existing pages +โœ… Follows best practices for visualization + +### ๐Ÿ’ก Usage Example: + +from utils.plot_helpers import plot_roc_curve +import streamlit as st + +After model training +y_pred_proba = model.predict_proba(X_test)[:, 1] + +Create and display ROC curve +roc_fig = plot_roc_curve(y_test, y_pred_proba, +title="ROC Curve - Logistic Regression") +st.pyplot(roc_fig) + +text + +### ๐Ÿ“Š Features: + +- Clean, professional visualizations +- Customizable titles +- AUC score calculation and display +- Reference line for random classifier +- Area under curve shading +- Annotation box with AUC value +- Grid and styling optimizations + +### ๐Ÿงช Testing: + +- โœ… Tested with binary classification data +- โœ… Verified AUC calculation accuracy +- โœ… Tested Streamlit integration +- โœ… Verified matplotlib figure compatibility +- โœ… Tested with different data sizes + +The implementation is production-ready and can be easily integrated into all model pages for consistent, professional visualizations! 
๐Ÿš€ + +**Hacktoberfest 2025** ๐ŸŽƒ + +--- + +**Author**: Akshit +**Issue**: #5 +**Type**: Enhancement \ No newline at end of file diff --git a/Docs/random_forest.md b/Docs/random_forest.md new file mode 100644 index 0000000..c69a1ec --- /dev/null +++ b/Docs/random_forest.md @@ -0,0 +1,59 @@ +# Random Forest - Documentation + +## ๐Ÿ“‹ Overview + +Random Forest is an ensemble learning method that combines multiple decision trees to make more accurate and stable predictions[web:100][web:102]. + +**Key Characteristics:** +- **Type**: Ensemble - Classification/Regression +- **Algorithm**: Bagging + Random feature selection +- **Output**: Averaged predictions from multiple trees +- **Best For**: Complex patterns, high-dimensional data + +## ๐ŸŽฏ Purpose and Use Cases + +- **Credit Risk Assessment**: More robust than single tree +- **Disease Diagnosis**: Reduces false positives/negatives +- **Image Classification**: Feature extraction +- **Stock Market Prediction**: Complex patterns +- **Customer Churn**: Better generalization + +## ๐Ÿš€ How to Run + +[Follow same structure as previous models] + +## ๐Ÿ“Š Key Parameters + +| Parameter | Description | Default | Recommendation | +|-----------|-------------|---------|----------------| +| **n_estimators** | Number of trees | 100 | 50-500 | +| **max_depth** | Depth per tree | None | 10-30 | +| **min_samples_split** | Samples to split | 2 | 2-10 | +| **max_features** | Features per split | sqrt | sqrt/log2 | + +## ๐Ÿ’ก Advantages Over Single Decision Tree + +โœ… Reduces overfitting +โœ… More stable predictions +โœ… Better accuracy +โœ… Handles missing values better +โœ… Less sensitive to outliers + +## ๐Ÿ› Troubleshooting + +### Slow Training +- Reduce n_estimators +- Reduce max_depth +- Use smaller dataset for testing + +### Still Overfitting +- Reduce max_depth +- Increase min_samples_split +- Reduce max_features + +--- + +**Last Updated**: October 13, 2025 +**Version**: 1.0 +**Author**: Akshit +**Hacktoberfest 2025 
Contribution** ๐ŸŽƒ diff --git a/Docs/svm.md b/Docs/svm.md new file mode 100644 index 0000000..bd27e3a --- /dev/null +++ b/Docs/svm.md @@ -0,0 +1,66 @@ +# Support Vector Machine (SVM) - Documentation + +## ๐Ÿ“‹ Overview + +SVM finds the optimal hyperplane that maximally separates different classes in the feature space[web:100][web:102]. + +**Key Characteristics:** +- **Type**: Supervised Learning - Classification +- **Algorithm**: Maximum margin classifier +- **Output**: Class label +- **Best For**: High-dimensional data, clear margins + +## ๐ŸŽฏ Purpose and Use Cases + +- **Text Classification**: Spam detection, sentiment analysis +- **Image Recognition**: Face detection, object classification +- **Bioinformatics**: Protein classification, gene expression +- **Financial**: Stock trend prediction +- **Medical**: Disease classification + +## ๐Ÿ“Š Key Parameters + +| Parameter | Description | Default | Recommendation | +|-----------|-------------|---------|----------------| +| **C** | Regularization | 1.0 | 0.1-100 | +| **kernel** | Kernel type | rbf | linear/rbf/poly | +| **gamma** | Kernel coefficient | scale | scale/auto | + +## ๐Ÿ’ก Kernel Selection + +- **linear**: Linearly separable data, large features +- **rbf** (radial basis function): Default, most cases +- **poly** (polynomial): Specific polynomial relationships +- **sigmoid**: Neural network-like behavior + +## ๐Ÿ”ง Parameter Tuning + +### C (Regularization) +- **Low C**: Wider margin, more errors (underfitting) +- **High C**: Narrow margin, fewer errors (overfitting) +- **Start with**: 1.0, then try 0.1, 10, 100 + +### Gamma (RBF kernel) +- **Low gamma**: Far-reaching influence, smooth decision boundary +- **High gamma**: Close influence, complex decision boundary +- **Use**: 'scale' (default) or 'auto' + +## ๐Ÿ› Troubleshooting + +### Slow Training +- Use linear kernel for large datasets +- Reduce training data +- Scale features first + +### Poor Performance +- Try different kernels +- Tune C and gamma 
+- Scale features (mandatory for SVM!) +- Check if data is separable + +--- + +**Last Updated**: October 13, 2025 +**Version**: 1.0 +**Author**: Akshit +**Hacktoberfest 2025 Contribution** ๐ŸŽƒ diff --git a/app.py b/app.py index 79bb66a..1a5c1c1 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,64 @@ -import streamlit as st - -st.set_page_config( - page_title="ML Model Simulator", - page_icon="๐Ÿค–", - layout="wide" -) - -st.title("๐Ÿค– ML Model Simulator") -st.write(""" -Welcome to the **ML Model Simulator**! -- Explore ML models like Linear Regression, Logistic Regression, etc. -- Adjust parameters, visualize results, and test with sample inputs. -- Navigate using the sidebar. -""") +""" +ML Simulator - Flask Application +Author: Akshit +Date: October 13, 2025 +""" + +from flask import Flask, jsonify, send_from_directory +from flask_cors import CORS +import os +from routes.ml_routes import ml_bp +from routes.resume_routes import resume_bp + +# Initialize Flask app +app = Flask(__name__, static_folder='../frontend/build') + +# Configuration +app.config['SECRET_KEY'] = os.getenv('SECRET_KEY', 'dev-secret-key') +app.config['UPLOAD_FOLDER'] = os.getenv('UPLOAD_FOLDER', 'uploads') +app.config['MAX_CONTENT_LENGTH'] = int(os.getenv('MAX_CONTENT_LENGTH', 16777216)) + +# Enable CORS +CORS(app, origins=os.getenv('CORS_ORIGINS', '*').split(',')) + +# Register blueprints +app.register_blueprint(ml_bp, url_prefix='/api/models') +app.register_blueprint(resume_bp, url_prefix='/api/resume') + +# Health check endpoint +@app.route('/health') +def health_check(): + """Health check endpoint for Docker""" + return jsonify({ + 'status': 'healthy', + 'service': 'ML Simulator', + 'version': '1.0.0' + }), 200 + +# Serve React frontend +@app.route('/', defaults={'path': ''}) +@app.route('/') +def serve_frontend(path): + """Serve React frontend""" + if path != "" and os.path.exists(app.static_folder + '/' + path): + return send_from_directory(app.static_folder, path) + else: + return 
send_from_directory(app.static_folder, 'index.html') + +# Error handlers +@app.errorhandler(404) +def not_found(error): + return jsonify({'error': 'Not found'}), 404 + +@app.errorhandler(500) +def internal_error(error): + return jsonify({'error': 'Internal server error'}), 500 + +if __name__ == '__main__': + # Create upload directories if they don't exist + os.makedirs(os.path.join(app.config['UPLOAD_FOLDER'], 'datasets'), exist_ok=True) + os.makedirs(os.path.join(app.config['UPLOAD_FOLDER'], 'resumes'), exist_ok=True) + + # Run the app + port = int(os.getenv('PORT', 5000)) + app.run(host='0.0.0.0', port=port, debug=False) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..2d27514 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,77 @@ +# Docker Compose configuration for ML Simulator +# Author: Akshit +# Date: October 13, 2025 + +version: '3.8' + +services: + # ML Simulator Application + ml-simulator: + build: + context: . + dockerfile: Dockerfile + container_name: ml-simulator-app + ports: + - "5000:5000" + - "3000:3000" + environment: + - FLASK_APP=app.py + - FLASK_ENV=production + - SECRET_KEY=your-secret-key-change-in-production + - DATABASE_URL=sqlite:///thonhub.db + - UPLOAD_FOLDER=/app/backend/uploads + - MAX_CONTENT_LENGTH=16777216 + - CORS_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 + volumes: + # Persist uploaded files + - uploads-data:/app/backend/uploads + # Persist database + - db-data:/app/backend/data + restart: unless-stopped + networks: + - ml-simulator-network + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + # Optional: PostgreSQL Database (if needed) + # Uncomment if you want to use PostgreSQL instead of SQLite + # postgres: + # image: postgres:15-alpine + # container_name: ml-simulator-db + # environment: + # - POSTGRES_DB=thonhub + # - POSTGRES_USER=thonhub + # - POSTGRES_PASSWORD=your-secure-password + # 
ports: + # - "5432:5432" + # volumes: + # - postgres-data:/var/lib/postgresql/data + # restart: unless-stopped + # networks: + # - ml-simulator-network + + # Optional: Redis for caching (if needed) + # redis: + # image: redis:7-alpine + # container_name: ml-simulator-cache + # ports: + # - "6379:6379" + # restart: unless-stopped + # networks: + # - ml-simulator-network + +volumes: + uploads-data: + driver: local + db-data: + driver: local + # postgres-data: + # driver: local + +networks: + ml-simulator-network: + driver: bridge diff --git a/e.yml .dockerignore b/e.yml .dockerignore new file mode 100644 index 0000000..c099a4e --- /dev/null +++ b/e.yml .dockerignore @@ -0,0 +1,162 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + ESC-j * Forward one file line (or _N file lines). + ESC-k * Backward one file line (or _N file lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + ESC-b * Backward one window, but don't stop at beginning-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. 
+ F Forward forever; like "tail -f". + ESC-F Like F but stop when search pattern is found. + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. + --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ^O^N ^On * Search forward for (_N-th) OSC8 hyperlink. + ^O^P ^Op * Search backward for (_N-th) OSC8 hyperlink. + ^O^L ^Ol Jump to the currently selected OSC8 hyperlink. + ESC-u Undo (toggle) search highlighting. + ESC-U Clear search highlighting. + &_p_a_t_t_e_r_n * Display only matching lines. + --------------------------------------------------- + Search is case-sensitive unless changed with -i or -I. + A search pattern may begin with one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). + ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + ^S _n Search for match in _n-th parenthesized subpattern. + ^W WRAP search if no match found. + ^L Enter next character literally into pattern. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. 
+ } ) ] * Find open bracket { ( [. + ESC-^F _<_c_1_> _<_c_2_> * Find close bracket _<_c_2_>. + ESC-^B _<_c_1_> _<_c_2_> * Find open bracket _<_c_1_>. + --------------------------------------------------- + Each "find close bracket" command goes forward to the close bracket + matching the (_N-th) open bracket in the top line. + Each "find open bracket" command goes backward to the open bracket + matching the (_N-th) close bracket in the bottom line. + + m_<_l_e_t_t_e_r_> Mark the current top line with . + M_<_l_e_t_t_e_r_> Mark the current bottom line with . + '_<_l_e_t_t_e_r_> Go to a previously marked position. + '' Go to the previous position. + ^X^X Same as '. + ESC-m_<_l_e_t_t_e_r_> Clear a mark. + --------------------------------------------------- + A mark is any upper-case or lower-case letter. + Certain marks are predefined: + ^ means beginning of the file + $ means end of the file + --------------------------------------------------------------------------- + + CCHHAANNGGIINNGG FFIILLEESS + + :e [_f_i_l_e] Examine a new file. + ^X^V Same as :e. + :n * Examine the (_N-th) next file from the command line. + :p * Examine the (_N-th) previous file from the command line. + :x * Examine the first (or _N-th) file from the command line. + ^O^O Open the currently selected OSC8 hyperlink. + :d Delete the current file from the command line list. + = ^G :f Print current file name. + --------------------------------------------------------------------------- + + MMIISSCCEELLLLAANNEEOOUUSS CCOOMMMMAANNDDSS + + -_<_f_l_a_g_> Toggle a command line option [see OPTIONS below]. + --_<_n_a_m_e_> Toggle a command line option, by name. + __<_f_l_a_g_> Display the setting of a command line option. + ___<_n_a_m_e_> Display the setting of an option, by name. + +_c_m_d Execute the less cmd each time a new file is examined. + + !_c_o_m_m_a_n_d Execute the shell command with $SHELL. + #_c_o_m_m_a_n_d Execute the shell command, expanded like a prompt. 
+ |XX_c_o_m_m_a_n_d Pipe file between current pos & mark XX to shell command. + s _f_i_l_e Save input to a file. + v Edit the current file with $VISUAL or $EDITOR. + V Print version number of "less". + --------------------------------------------------------------------------- + + OOPPTTIIOONNSS + + Most options may be changed either on the command line, + or from within less by using the - or -- command. + Options may be given in one of two forms: either a single + character preceded by a -, or a name preceded by --. + + -? ........ --help + Display help (from command line). + -a ........ --search-skip-screen + Search skips current screen. + -A ........ --SEARCH-SKIP-SCREEN + Search starts just after target line. + -b [_N] .... --buffers=[_N] + Number of buffers. + -B ........ --auto-buffers + Don't automatically allocate buffers for pipes. + -c ........ --clear-screen + Repaint by clearing rather than scrolling. + -d ........ --dumb + Dumb terminal. + -D xx_c_o_l_o_r . --color=xx_c_o_l_o_r + Set screen colors. + -e -E .... --quit-at-eof --QUIT-AT-EOF + Quit at end of file. + -f ........ --force + Force open non-regular files. + -F ........ --quit-if-one-screen + Quit if entire file fits on first screen. + -g ........ --hilite-search + Highlight only last match for searches. + -G ........ --HILITE-SEARCH + Don't highlight any matches for searches. + -h [_N] .... --max-back-scroll=[_N] + Backward scroll limit. + -i ........ --ignore-case + Ignore case in searches that do not contain uppercase. + -I ........ --IGNORE-CASE + Ignore case in all searches. 
diff --git a/etup for issue b/etup for issue new file mode 100644 index 0000000..7d6f0a4 --- /dev/null +++ b/etup for issue @@ -0,0 +1,4 @@ +* Dockerfile + Documentation + Linear_Regression + main diff --git a/h --set-upstream origin dockerfile b/h --set-upstream origin dockerfile new file mode 100644 index 0000000..74570f6 --- /dev/null +++ b/h --set-upstream origin dockerfile @@ -0,0 +1,324 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + ESC-j * Forward one file line (or _N file lines). + ESC-k * Backward one file line (or _N file lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + ESC-b * Backward one window, but don't stop at beginning-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. + F Forward forever; like "tail -f". + ESC-F Like F but stop when search pattern is found. + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. 
+ --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ^O^N ^On * Search forward for (_N-th) OSC8 hyperlink. + ^O^P ^Op * Search backward for (_N-th) OSC8 hyperlink. + ^O^L ^Ol Jump to the currently selected OSC8 hyperlink. + ESC-u Undo (toggle) search highlighting. + ESC-U Clear search highlighting. + &_p_a_t_t_e_r_n * Display only matching lines. + --------------------------------------------------- + Search is case-sensitive unless changed with -i or -I. + A search pattern may begin with one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). + ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + ^S _n Search for match in _n-th parenthesized subpattern. + ^W WRAP search if no match found. + ^L Enter next character literally into pattern. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. + } ) ] * Find open bracket { ( [. + ESC-^F _<_c_1_> _<_c_2_> * Find close bracket _<_c_2_>. + ESC-^B _<_c_1_> _<_c_2_> * Find open bracket _<_c_1_>. 
+ --------------------------------------------------- + Each "find close bracket" command goes forward to the close bracket + matching the (_N-th) open bracket in the top line. + Each "find open bracket" command goes backward to the open bracket + matching the (_N-th) close bracket in the bottom line. + + m_<_l_e_t_t_e_r_> Mark the current top line with . + M_<_l_e_t_t_e_r_> Mark the current bottom line with . + '_<_l_e_t_t_e_r_> Go to a previously marked position. + '' Go to the previous position. + ^X^X Same as '. + ESC-m_<_l_e_t_t_e_r_> Clear a mark. + --------------------------------------------------- + A mark is any upper-case or lower-case letter. + Certain marks are predefined: + ^ means beginning of the file + $ means end of the file + --------------------------------------------------------------------------- + + CCHHAANNGGIINNGG FFIILLEESS + + :e [_f_i_l_e] Examine a new file. + ^X^V Same as :e. + :n * Examine the (_N-th) next file from the command line. + :p * Examine the (_N-th) previous file from the command line. + :x * Examine the first (or _N-th) file from the command line. + ^O^O Open the currently selected OSC8 hyperlink. + :d Delete the current file from the command line list. + = ^G :f Print current file name. + --------------------------------------------------------------------------- + + MMIISSCCEELLLLAANNEEOOUUSS CCOOMMMMAANNDDSS + + -_<_f_l_a_g_> Toggle a command line option [see OPTIONS below]. + --_<_n_a_m_e_> Toggle a command line option, by name. + __<_f_l_a_g_> Display the setting of a command line option. + ___<_n_a_m_e_> Display the setting of an option, by name. + +_c_m_d Execute the less cmd each time a new file is examined. + + !_c_o_m_m_a_n_d Execute the shell command with $SHELL. + #_c_o_m_m_a_n_d Execute the shell command, expanded like a prompt. + |XX_c_o_m_m_a_n_d Pipe file between current pos & mark XX to shell command. + s _f_i_l_e Save input to a file. + v Edit the current file with $VISUAL or $EDITOR. 
+ V Print version number of "less". + --------------------------------------------------------------------------- + + OOPPTTIIOONNSS + + Most options may be changed either on the command line, + or from within less by using the - or -- command. + Options may be given in one of two forms: either a single + character preceded by a -, or a name preceded by --. + + -? ........ --help + Display help (from command line). + -a ........ --search-skip-screen + Search skips current screen. + -A ........ --SEARCH-SKIP-SCREEN + Search starts just after target line. + -b [_N] .... --buffers=[_N] + Number of buffers. + -B ........ --auto-buffers + Don't automatically allocate buffers for pipes. + -c ........ --clear-screen + Repaint by clearing rather than scrolling. + -d ........ --dumb + Dumb terminal. + -D xx_c_o_l_o_r . --color=xx_c_o_l_o_r + Set screen colors. + -e -E .... --quit-at-eof --QUIT-AT-EOF + Quit at end of file. + -f ........ --force + Force open non-regular files. + -F ........ --quit-if-one-screen + Quit if entire file fits on first screen. + -g ........ --hilite-search + Highlight only last match for searches. + -G ........ --HILITE-SEARCH + Don't highlight any matches for searches. + -h [_N] .... --max-back-scroll=[_N] + Backward scroll limit. + -i ........ --ignore-case + Ignore case in searches that do not contain uppercase. + -I ........ --IGNORE-CASE + Ignore case in all searches. + -j [_N] .... --jump-target=[_N] + Screen position of target lines. + -J ........ --status-column + Display a status column at left edge of screen. + -k _f_i_l_e ... --lesskey-file=_f_i_l_e + Use a compiled lesskey file. + -K ........ --quit-on-intr + Exit less in response to ctrl-C. + -L ........ --no-lessopen + Ignore the LESSOPEN environment variable. + -m -M .... --long-prompt --LONG-PROMPT + Set prompt style. + -n ......... --line-numbers + Suppress line numbers in prompts and messages. + -N ......... --LINE-NUMBERS + Display line number at start of each line. 
+ -o [_f_i_l_e] .. --log-file=[_f_i_l_e] + Copy to log file (standard input only). + -O [_f_i_l_e] .. --LOG-FILE=[_f_i_l_e] + Copy to log file (unconditionally overwrite). + -p _p_a_t_t_e_r_n . --pattern=[_p_a_t_t_e_r_n] + Start at pattern (from command line). + -P [_p_r_o_m_p_t] --prompt=[_p_r_o_m_p_t] + Define new prompt. + -q -Q .... --quiet --QUIET --silent --SILENT + Quiet the terminal bell. + -r -R .... --raw-control-chars --RAW-CONTROL-CHARS + Output "raw" control characters. + -s ........ --squeeze-blank-lines + Squeeze multiple blank lines. + -S ........ --chop-long-lines + Chop (truncate) long lines rather than wrapping. + -t _t_a_g .... --tag=[_t_a_g] + Find a tag. + -T [_t_a_g_s_f_i_l_e] --tag-file=[_t_a_g_s_f_i_l_e] + Use an alternate tags file. + -u -U .... --underline-special --UNDERLINE-SPECIAL + Change handling of backspaces, tabs and carriage returns. + -V ........ --version + Display the version number of "less". + -w ........ --hilite-unread + Highlight first new line after forward-screen. + -W ........ --HILITE-UNREAD + Highlight first new line after any forward movement. + -x [_N[,...]] --tabs=[_N[,...]] + Set tab stops. + -X ........ --no-init + Don't use termcap init/deinit strings. + -y [_N] .... --max-forw-scroll=[_N] + Forward scroll limit. + -z [_N] .... --window=[_N] + Set size of window. + -" [_c[_c]] . --quotes=[_c[_c]] + Set shell quote characters. + -~ ........ --tilde + Don't display tildes after end of file. + -# [_N] .... --shift=[_N] + Set horizontal scroll amount (0 = one half screen width). + + --exit-follow-on-close + Exit F command on a pipe when writer closes pipe. + --file-size + Automatically determine the size of the input file. + --follow-name + The F command changes files if the input file is renamed. + --form-feed + Stop scrolling when a form feed character is reached. + --header=[_L[,_C[,_N]]] + Use _L lines (starting at line _N) and _C columns as headers. 
+ --incsearch + Search file as each pattern character is typed in. + --intr=[_C] + Use _C instead of ^X to interrupt a read. + --lesskey-context=_t_e_x_t + Use lesskey source file contents. + --lesskey-src=_f_i_l_e + Use a lesskey source file. + --line-num-width=[_N] + Set the width of the -N line number field to _N characters. + --match-shift=[_N] + Show at least _N characters to the left of a search match. + --modelines=[_N] + Read _N lines from the input file and look for vim modelines. + --mouse + Enable mouse input. + --no-edit-warn + Don't warn when using v command on a file opened via LESSOPEN. + --no-keypad + Don't send termcap keypad init/deinit strings. + --no-histdups + Remove duplicates from command history. + --no-number-headers + Don't give line numbers to header lines. + --no-paste + Ignore pasted input. + --no-search-header-lines + Searches do not include header lines. + --no-search-header-columns + Searches do not include header columns. + --no-search-headers + Searches do not include header lines or columns. + --no-vbell + Disable the terminal's visual bell. + --redraw-on-quit + Redraw final screen when quitting. + --rscroll=[_C] + Set the character used to mark truncated lines. + --save-marks + Retain marks across invocations of less. + --search-options=[EFKNRW-] + Set default options for every search. + --show-preproc-errors + Display a message if preprocessor exits with an error status. + --proc-backspace + Process backspaces for bold/underline. + --PROC-BACKSPACE + Treat backspaces as control characters. + --proc-return + Delete carriage returns before newline. + --PROC-RETURN + Treat carriage returns as control characters. + --proc-tab + Expand tabs to spaces. + --PROC-TAB + Treat tabs as control characters. + --status-col-width=[_N] + Set the width of the -J status column to _N characters. + --status-line + Highlight or color the entire line containing a mark. + --use-backslash + Subsequent options use backslash as escape char. 
+ --use-color + Enables colored text. + --wheel-lines=[_N] + Each click of the mouse wheel moves _N lines. + --wordwrap + Wrap lines at spaces. + + + --------------------------------------------------------------------------- + + LLIINNEE EEDDIITTIINNGG + + These keys can be used to edit text being entered + on the "command line" at the bottom of the screen. + + RightArrow ..................... ESC-l ... Move cursor right one character. + LeftArrow ...................... ESC-h ... Move cursor left one character. + ctrl-RightArrow ESC-RightArrow ESC-w ... Move cursor right one word. + ctrl-LeftArrow ESC-LeftArrow ESC-b ... Move cursor left one word. + HOME ........................... ESC-0 ... Move cursor to start of line. + END ............................ ESC-$ ... Move cursor to end of line. + BACKSPACE ................................ Delete char to left of cursor. + DELETE ......................... ESC-x ... Delete char under cursor. + ctrl-BACKSPACE ESC-BACKSPACE ........... Delete word to left of cursor. + ctrl-DELETE .... ESC-DELETE .... ESC-X ... Delete word under cursor. + ctrl-U ......... ESC (MS-DOS only) ....... Delete entire line. + UpArrow ........................ ESC-k ... Retrieve previous command line. + DownArrow ...................... ESC-j ... Retrieve next command line. + TAB ...................................... Complete filename & cycle. + SHIFT-TAB ...................... ESC-TAB Complete filename & reverse cycle. + ctrl-L ................................... Complete filename, list all. diff --git a/h -u origin dockerfile b/h -u origin dockerfile new file mode 100644 index 0000000..74570f6 --- /dev/null +++ b/h -u origin dockerfile @@ -0,0 +1,324 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. 
+ --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + ESC-j * Forward one file line (or _N file lines). + ESC-k * Backward one file line (or _N file lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + ESC-b * Backward one window, but don't stop at beginning-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. + F Forward forever; like "tail -f". + ESC-F Like F but stop when search pattern is found. + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. + --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ^O^N ^On * Search forward for (_N-th) OSC8 hyperlink. + ^O^P ^Op * Search backward for (_N-th) OSC8 hyperlink. + ^O^L ^Ol Jump to the currently selected OSC8 hyperlink. + ESC-u Undo (toggle) search highlighting. + ESC-U Clear search highlighting. 
+ &_p_a_t_t_e_r_n * Display only matching lines. + --------------------------------------------------- + Search is case-sensitive unless changed with -i or -I. + A search pattern may begin with one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). + ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + ^S _n Search for match in _n-th parenthesized subpattern. + ^W WRAP search if no match found. + ^L Enter next character literally into pattern. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. + } ) ] * Find open bracket { ( [. + ESC-^F _<_c_1_> _<_c_2_> * Find close bracket _<_c_2_>. + ESC-^B _<_c_1_> _<_c_2_> * Find open bracket _<_c_1_>. + --------------------------------------------------- + Each "find close bracket" command goes forward to the close bracket + matching the (_N-th) open bracket in the top line. + Each "find open bracket" command goes backward to the open bracket + matching the (_N-th) close bracket in the bottom line. + + m_<_l_e_t_t_e_r_> Mark the current top line with . + M_<_l_e_t_t_e_r_> Mark the current bottom line with . + '_<_l_e_t_t_e_r_> Go to a previously marked position. + '' Go to the previous position. + ^X^X Same as '. + ESC-m_<_l_e_t_t_e_r_> Clear a mark. + --------------------------------------------------- + A mark is any upper-case or lower-case letter. 
+ Certain marks are predefined: + ^ means beginning of the file + $ means end of the file + --------------------------------------------------------------------------- + + CCHHAANNGGIINNGG FFIILLEESS + + :e [_f_i_l_e] Examine a new file. + ^X^V Same as :e. + :n * Examine the (_N-th) next file from the command line. + :p * Examine the (_N-th) previous file from the command line. + :x * Examine the first (or _N-th) file from the command line. + ^O^O Open the currently selected OSC8 hyperlink. + :d Delete the current file from the command line list. + = ^G :f Print current file name. + --------------------------------------------------------------------------- + + MMIISSCCEELLLLAANNEEOOUUSS CCOOMMMMAANNDDSS + + -_<_f_l_a_g_> Toggle a command line option [see OPTIONS below]. + --_<_n_a_m_e_> Toggle a command line option, by name. + __<_f_l_a_g_> Display the setting of a command line option. + ___<_n_a_m_e_> Display the setting of an option, by name. + +_c_m_d Execute the less cmd each time a new file is examined. + + !_c_o_m_m_a_n_d Execute the shell command with $SHELL. + #_c_o_m_m_a_n_d Execute the shell command, expanded like a prompt. + |XX_c_o_m_m_a_n_d Pipe file between current pos & mark XX to shell command. + s _f_i_l_e Save input to a file. + v Edit the current file with $VISUAL or $EDITOR. + V Print version number of "less". + --------------------------------------------------------------------------- + + OOPPTTIIOONNSS + + Most options may be changed either on the command line, + or from within less by using the - or -- command. + Options may be given in one of two forms: either a single + character preceded by a -, or a name preceded by --. + + -? ........ --help + Display help (from command line). + -a ........ --search-skip-screen + Search skips current screen. + -A ........ --SEARCH-SKIP-SCREEN + Search starts just after target line. + -b [_N] .... --buffers=[_N] + Number of buffers. + -B ........ 
--auto-buffers + Don't automatically allocate buffers for pipes. + -c ........ --clear-screen + Repaint by clearing rather than scrolling. + -d ........ --dumb + Dumb terminal. + -D xx_c_o_l_o_r . --color=xx_c_o_l_o_r + Set screen colors. + -e -E .... --quit-at-eof --QUIT-AT-EOF + Quit at end of file. + -f ........ --force + Force open non-regular files. + -F ........ --quit-if-one-screen + Quit if entire file fits on first screen. + -g ........ --hilite-search + Highlight only last match for searches. + -G ........ --HILITE-SEARCH + Don't highlight any matches for searches. + -h [_N] .... --max-back-scroll=[_N] + Backward scroll limit. + -i ........ --ignore-case + Ignore case in searches that do not contain uppercase. + -I ........ --IGNORE-CASE + Ignore case in all searches. + -j [_N] .... --jump-target=[_N] + Screen position of target lines. + -J ........ --status-column + Display a status column at left edge of screen. + -k _f_i_l_e ... --lesskey-file=_f_i_l_e + Use a compiled lesskey file. + -K ........ --quit-on-intr + Exit less in response to ctrl-C. + -L ........ --no-lessopen + Ignore the LESSOPEN environment variable. + -m -M .... --long-prompt --LONG-PROMPT + Set prompt style. + -n ......... --line-numbers + Suppress line numbers in prompts and messages. + -N ......... --LINE-NUMBERS + Display line number at start of each line. + -o [_f_i_l_e] .. --log-file=[_f_i_l_e] + Copy to log file (standard input only). + -O [_f_i_l_e] .. --LOG-FILE=[_f_i_l_e] + Copy to log file (unconditionally overwrite). + -p _p_a_t_t_e_r_n . --pattern=[_p_a_t_t_e_r_n] + Start at pattern (from command line). + -P [_p_r_o_m_p_t] --prompt=[_p_r_o_m_p_t] + Define new prompt. + -q -Q .... --quiet --QUIET --silent --SILENT + Quiet the terminal bell. + -r -R .... --raw-control-chars --RAW-CONTROL-CHARS + Output "raw" control characters. + -s ........ --squeeze-blank-lines + Squeeze multiple blank lines. + -S ........ 
--chop-long-lines + Chop (truncate) long lines rather than wrapping. + -t _t_a_g .... --tag=[_t_a_g] + Find a tag. + -T [_t_a_g_s_f_i_l_e] --tag-file=[_t_a_g_s_f_i_l_e] + Use an alternate tags file. + -u -U .... --underline-special --UNDERLINE-SPECIAL + Change handling of backspaces, tabs and carriage returns. + -V ........ --version + Display the version number of "less". + -w ........ --hilite-unread + Highlight first new line after forward-screen. + -W ........ --HILITE-UNREAD + Highlight first new line after any forward movement. + -x [_N[,...]] --tabs=[_N[,...]] + Set tab stops. + -X ........ --no-init + Don't use termcap init/deinit strings. + -y [_N] .... --max-forw-scroll=[_N] + Forward scroll limit. + -z [_N] .... --window=[_N] + Set size of window. + -" [_c[_c]] . --quotes=[_c[_c]] + Set shell quote characters. + -~ ........ --tilde + Don't display tildes after end of file. + -# [_N] .... --shift=[_N] + Set horizontal scroll amount (0 = one half screen width). + + --exit-follow-on-close + Exit F command on a pipe when writer closes pipe. + --file-size + Automatically determine the size of the input file. + --follow-name + The F command changes files if the input file is renamed. + --form-feed + Stop scrolling when a form feed character is reached. + --header=[_L[,_C[,_N]]] + Use _L lines (starting at line _N) and _C columns as headers. + --incsearch + Search file as each pattern character is typed in. + --intr=[_C] + Use _C instead of ^X to interrupt a read. + --lesskey-context=_t_e_x_t + Use lesskey source file contents. + --lesskey-src=_f_i_l_e + Use a lesskey source file. + --line-num-width=[_N] + Set the width of the -N line number field to _N characters. + --match-shift=[_N] + Show at least _N characters to the left of a search match. + --modelines=[_N] + Read _N lines from the input file and look for vim modelines. + --mouse + Enable mouse input. + --no-edit-warn + Don't warn when using v command on a file opened via LESSOPEN. 
+ --no-keypad + Don't send termcap keypad init/deinit strings. + --no-histdups + Remove duplicates from command history. + --no-number-headers + Don't give line numbers to header lines. + --no-paste + Ignore pasted input. + --no-search-header-lines + Searches do not include header lines. + --no-search-header-columns + Searches do not include header columns. + --no-search-headers + Searches do not include header lines or columns. + --no-vbell + Disable the terminal's visual bell. + --redraw-on-quit + Redraw final screen when quitting. + --rscroll=[_C] + Set the character used to mark truncated lines. + --save-marks + Retain marks across invocations of less. + --search-options=[EFKNRW-] + Set default options for every search. + --show-preproc-errors + Display a message if preprocessor exits with an error status. + --proc-backspace + Process backspaces for bold/underline. + --PROC-BACKSPACE + Treat backspaces as control characters. + --proc-return + Delete carriage returns before newline. + --PROC-RETURN + Treat carriage returns as control characters. + --proc-tab + Expand tabs to spaces. + --PROC-TAB + Treat tabs as control characters. + --status-col-width=[_N] + Set the width of the -J status column to _N characters. + --status-line + Highlight or color the entire line containing a mark. + --use-backslash + Subsequent options use backslash as escape char. + --use-color + Enables colored text. + --wheel-lines=[_N] + Each click of the mouse wheel moves _N lines. + --wordwrap + Wrap lines at spaces. + + + --------------------------------------------------------------------------- + + LLIINNEE EEDDIITTIINNGG + + These keys can be used to edit text being entered + on the "command line" at the bottom of the screen. + + RightArrow ..................... ESC-l ... Move cursor right one character. + LeftArrow ...................... ESC-h ... Move cursor left one character. + ctrl-RightArrow ESC-RightArrow ESC-w ... Move cursor right one word. 
+ ctrl-LeftArrow ESC-LeftArrow ESC-b ... Move cursor left one word. + HOME ........................... ESC-0 ... Move cursor to start of line. + END ............................ ESC-$ ... Move cursor to end of line. + BACKSPACE ................................ Delete char to left of cursor. + DELETE ......................... ESC-x ... Delete char under cursor. + ctrl-BACKSPACE ESC-BACKSPACE ........... Delete word to left of cursor. + ctrl-DELETE .... ESC-DELETE .... ESC-X ... Delete word under cursor. + ctrl-U ......... ESC (MS-DOS only) ....... Delete entire line. + UpArrow ........................ ESC-k ... Retrieve previous command line. + DownArrow ...................... ESC-j ... Retrieve next command line. + TAB ...................................... Complete filename & cycle. + SHIFT-TAB ...................... ESC-TAB Complete filename & reverse cycle. + ctrl-L ................................... Complete filename, list all. diff --git a/models/decision_tree.py b/models/decision_tree.py new file mode 100644 index 0000000..f4bfbfa --- /dev/null +++ b/models/decision_tree.py @@ -0,0 +1,195 @@ +""" +Decision Tree Model Implementation +Author: Akshit +Date: October 13, 2025 +Purpose: Decision Tree classifier and regressor for ML Simulator +""" + +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree +from sklearn.model_selection import train_test_split +from sklearn.metrics import (accuracy_score, precision_score, recall_score, + f1_score, confusion_matrix, classification_report, + mean_squared_error, r2_score, mean_absolute_error) +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +class DecisionTreeModel: + """ + Decision Tree wrapper for classification and regression + """ + + def __init__(self, task='classification', max_depth=5, min_samples_split=2, + min_samples_leaf=1, criterion='gini', random_state=42): + """ + Initialize Decision Tree model + + Parameters: + ----------- + task : str + 
'classification' or 'regression' + max_depth : int + Maximum depth of the tree + min_samples_split : int + Minimum samples required to split + min_samples_leaf : int + Minimum samples required in leaf node + criterion : str + 'gini' or 'entropy' for classification, 'mse' or 'mae' for regression + random_state : int + Random seed for reproducibility + """ + self.task = task + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.criterion = criterion + self.random_state = random_state + + if task == 'classification': + self.model = DecisionTreeClassifier( + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + criterion=criterion, + random_state=random_state + ) + else: + self.model = DecisionTreeRegressor( + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + criterion=criterion if criterion in ['mse', 'mae'] else 'squared_error', + random_state=random_state + ) + + def train(self, X, y, test_size=0.2): + """ + Train the Decision Tree model + + Parameters: + ----------- + X : array-like or DataFrame + Feature matrix + y : array-like or Series + Target variable + test_size : float + Proportion of test set + + Returns: + -------- + dict : Training results including metrics + """ + # Split data + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=self.random_state + ) + + # Train model + self.model.fit(X_train, y_train) + + # Predictions + y_pred = self.model.predict(X_test) + y_train_pred = self.model.predict(X_train) + + # Calculate metrics based on task + if self.task == 'classification': + results = self._classification_metrics( + y_train, y_train_pred, y_test, y_pred, X_test + ) + else: + results = self._regression_metrics( + y_train, y_train_pred, y_test, y_pred + ) + + # Add feature importance + results['feature_importance'] = self.model.feature_importances_ + 
results['n_nodes'] = self.model.tree_.node_count + results['n_leaves'] = self.model.get_n_leaves() + results['max_depth_achieved'] = self.model.get_depth() + + # Store for later use + self.X_train = X_train + self.X_test = X_test + self.y_train = y_train + self.y_test = y_test + self.y_pred = y_pred + + return results + + def _classification_metrics(self, y_train, y_train_pred, y_test, y_pred, X_test): + """Calculate classification metrics""" + + # Probabilities + y_pred_proba = self.model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) == 2 else None + + return { + 'train_accuracy': accuracy_score(y_train, y_train_pred), + 'test_accuracy': accuracy_score(y_test, y_pred), + 'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0), + 'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0), + 'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0), + 'confusion_matrix': confusion_matrix(y_test, y_pred), + 'classification_report': classification_report(y_test, y_pred, output_dict=True), + 'y_pred': y_pred, + 'y_pred_proba': y_pred_proba, + 'y_test': y_test + } + + def _regression_metrics(self, y_train, y_train_pred, y_test, y_pred): + """Calculate regression metrics""" + return { + 'train_r2': r2_score(y_train, y_train_pred), + 'test_r2': r2_score(y_test, y_pred), + 'mse': mean_squared_error(y_test, y_pred), + 'rmse': np.sqrt(mean_squared_error(y_test, y_pred)), + 'mae': mean_absolute_error(y_test, y_pred), + 'y_pred': y_pred, + 'y_test': y_test + } + + def plot_tree_structure(self, feature_names=None, class_names=None, max_depth_display=3): + """ + Visualize the decision tree structure + + Parameters: + ----------- + feature_names : list + Names of features + class_names : list + Names of classes (for classification) + max_depth_display : int + Maximum depth to display + + Returns: + -------- + matplotlib.figure.Figure + """ + fig, ax = plt.subplots(figsize=(20, 10)) + + plot_tree(self.model, + 
filled=True, + feature_names=feature_names, + class_names=class_names, + rounded=True, + fontsize=10, + max_depth=max_depth_display, + ax=ax) + + plt.title(f'Decision Tree Structure (Max Depth: {self.max_depth})', + fontsize=16, fontweight='bold', pad=20) + plt.tight_layout() + return fig + + def predict(self, X): + """Make predictions on new data""" + return self.model.predict(X) + + def get_feature_importance(self, feature_names): + """Get feature importance as DataFrame""" + importance_df = pd.DataFrame({ + 'Feature': feature_names, + 'Importance': self.model.feature_importances_ + }).sort_values('Importance', ascending=False) + + return importance_df diff --git a/pages/Decision_Tree.py b/pages/Decision_Tree.py new file mode 100644 index 0000000..cb68324 --- /dev/null +++ b/pages/Decision_Tree.py @@ -0,0 +1,307 @@ +""" +Decision Tree Simulator Page +Author: Akshit +Date: October 13, 2025 +""" + +import streamlit as st +import pandas as pd +import numpy as np +from models.decision_tree import DecisionTreeModel +from utils.plot_helpers import (plot_confusion_matrix, plot_feature_importance, + plot_prediction_distribution, plot_actual_vs_predicted, + plot_residuals) +from sklearn.datasets import load_iris, load_diabetes + +# Page configuration +st.set_page_config(page_title="Decision Tree Simulator", layout="wide", page_icon="๐ŸŒณ") + +# Custom CSS +st.markdown(""" + +""", unsafe_allow_html=True) + +# Header +st.markdown('

๐ŸŒณ Decision Tree Simulator

', unsafe_allow_html=True) + +st.markdown(""" +
+

๐ŸŽฏ About Decision Trees

+

Decision Trees are intuitive models that make predictions by learning simple decision rules + from data features. They're easy to interpret and visualize, making them perfect for understanding + how predictions are made.

+
+""", unsafe_allow_html=True) + +# Sidebar Configuration +st.sidebar.header("โš™๏ธ Model Configuration") + +# Task selection +task = st.sidebar.radio("Select Task:", ["Classification", "Regression"]) + +# Data source +data_source = st.sidebar.radio("Data Source:", ["Upload CSV", "Use Sample Dataset"]) + +df = None + +if data_source == "Upload CSV": + uploaded_file = st.sidebar.file_uploader("Upload CSV", type=['csv']) + if uploaded_file: + df = pd.read_csv(uploaded_file) + st.sidebar.success("โœ… File uploaded!") +else: + if task == "Classification": + data = load_iris() + df = pd.DataFrame(data.data, columns=data.feature_names) + df['target'] = data.target + st.sidebar.info("๐Ÿ“Š Using Iris Dataset (Classification)") + else: + data = load_diabetes() + df = pd.DataFrame(data.data, columns=data.feature_names) + df['target'] = data.target + st.sidebar.info("๐Ÿ“Š Using Diabetes Dataset (Regression)") + +if df is not None: + # Dataset Overview + st.markdown("### ๐Ÿ“ Dataset Overview") + col1, col2, col3 = st.columns(3) + with col1: + st.metric("Rows", df.shape[0]) + with col2: + st.metric("Columns", df.shape[1]) + with col3: + st.metric("Missing Values", df.isnull().sum().sum()) + + with st.expander("๐Ÿ‘€ View Dataset"): + st.dataframe(df.head(10)) + + # Feature and Target Selection + st.markdown("### ๐ŸŽฏ Feature & Target Selection") + + col1, col2 = st.columns([2, 1]) + with col1: + target_column = st.selectbox("Select Target Column:", df.columns) + with col2: + test_size = st.slider("Test Size (%)", 10, 50, 20) / 100 + + available_features = [col for col in df.columns if col != target_column] + selected_features = st.multiselect( + "Select Features:", + available_features, + default=available_features[:min(5, len(available_features))] + ) + + # Model Parameters + st.markdown("### ๐Ÿ”ง Model Parameters") + col1, col2, col3 = st.columns(3) + + with col1: + max_depth = st.slider("Max Depth", 1, 20, 5, + help="Maximum depth of the tree") + with col2: + 
min_samples_split = st.slider("Min Samples Split", 2, 20, 2, + help="Minimum samples to split a node") + with col3: + min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1, + help="Minimum samples in leaf node") + + if task == "Classification": + criterion = st.selectbox("Criterion:", ["gini", "entropy"], + help="Function to measure split quality") + else: + criterion = st.selectbox("Criterion:", ["squared_error", "absolute_error"], + help="Function to measure split quality") + + # Train Model + if len(selected_features) > 0 and st.button("๐Ÿš€ Train Decision Tree"): + with st.spinner('๐ŸŒณ Growing the tree...'): + # Prepare data + X = df[selected_features].fillna(df[selected_features].mean()) + y = df[target_column] + + # Initialize model + dt_model = DecisionTreeModel( + task=task.lower(), + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + criterion=criterion, + random_state=42 + ) + + # Train + results = dt_model.train(X, y, test_size=test_size) + + st.success("โœ… Decision Tree trained successfully!") + + # ==================== RESULTS ==================== + st.markdown("### ๐Ÿ“Š Training Results") + + if task == "Classification": + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.markdown(f""" +
+

Accuracy

+

{results['test_accuracy']:.2%}

+
+ """, unsafe_allow_html=True) + + with col2: + st.markdown(f""" +
+

Precision

+

{results['precision']:.2%}

+
+ """, unsafe_allow_html=True) + + with col3: + st.markdown(f""" +
+

Recall

+

{results['recall']:.2%}

+
+ """, unsafe_allow_html=True) + + with col4: + st.markdown(f""" +
+

F1-Score

+

{results['f1_score']:.2%}

+
+ """, unsafe_allow_html=True) + + # Confusion Matrix + st.markdown("### ๐ŸŽฏ Confusion Matrix") + cm_fig = plot_confusion_matrix(results['y_test'], results['y_pred']) + st.pyplot(cm_fig) + + # Classification Report + st.markdown("### ๐Ÿ“‹ Classification Report") + report_df = pd.DataFrame(results['classification_report']).transpose() + st.dataframe(report_df.style.background_gradient(cmap='Greens')) + + else: + # Regression metrics + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.markdown(f""" +
+

Rยฒ Score

+

{results['test_r2']:.3f}

+
+ """, unsafe_allow_html=True) + + with col2: + st.markdown(f""" +
+

MSE

+

{results['mse']:.2f}

+
+ """, unsafe_allow_html=True) + + with col3: + st.markdown(f""" +
+

RMSE

+

{results['rmse']:.2f}

+
+ """, unsafe_allow_html=True) + + with col4: + st.markdown(f""" +
+

MAE

+

{results['mae']:.2f}

+
+ """, unsafe_allow_html=True) + + # Actual vs Predicted + st.markdown("### ๐Ÿ“ˆ Actual vs Predicted") + avp_fig = plot_actual_vs_predicted(results['y_test'], results['y_pred']) + st.pyplot(avp_fig) + + # Residuals + st.markdown("### ๐Ÿ“‰ Residual Analysis") + res_fig = plot_residuals(results['y_test'], results['y_pred']) + st.pyplot(res_fig) + + # Tree Structure + st.markdown("### ๐ŸŒณ Decision Tree Structure") + st.info("Displaying top 3 levels of the tree for readability") + tree_fig = dt_model.plot_tree_structure( + feature_names=selected_features, + max_depth_display=3 + ) + st.pyplot(tree_fig) + + # Tree Statistics + col1, col2, col3 = st.columns(3) + with col1: + st.metric("Total Nodes", results['n_nodes']) + with col2: + st.metric("Leaf Nodes", results['n_leaves']) + with col3: + st.metric("Tree Depth", results['max_depth_achieved']) + + # Feature Importance + st.markdown("### โญ Feature Importance") + fi_fig = plot_feature_importance(selected_features, results['feature_importance']) + st.pyplot(fi_fig) + + # Feature importance table + importance_df = dt_model.get_feature_importance(selected_features) + st.dataframe(importance_df, use_container_width=True) + +else: + st.info("๐Ÿ‘† Please upload a dataset or select sample dataset to get started!") + + st.markdown(""" + ### ๐Ÿ“‹ How to Use: + 1. Choose task type (Classification or Regression) + 2. Upload CSV or use sample dataset + 3. Select target column and features + 4. Adjust tree parameters (max_depth, min_samples, etc.) + 5. Click **Train Decision Tree** + 6. View results, tree structure, and feature importance + + ### ๐ŸŒŸ Key Parameters: + - **Max Depth**: Limits tree depth to prevent overfitting + - **Min Samples Split**: Minimum samples needed to split a node + - **Min Samples Leaf**: Minimum samples in each leaf node + - **Criterion**: Gini/Entropy (classification) or MSE/MAE (regression) + """) + +# Footer +st.markdown("---") +st.markdown(""" +
+

๐ŸŽƒ Hacktoberfest 2025 | Built by Akshit | Decision Tree Simulator

+
+""", unsafe_allow_html=True) diff --git a/pages/Linear_Regression.py b/pages/Linear_Regression.py index 38e6e94..7e19eed 100644 --- a/pages/Linear_Regression.py +++ b/pages/Linear_Regression.py @@ -1,22 +1,409 @@ +# pages/Logistic_Regression.py import streamlit as st +import pandas as pd import numpy as np -from sklearn.linear_model import LinearRegression -from utils.plot_helpers import plot_regression_line +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score +from sklearn.preprocessing import StandardScaler +import plotly.graph_objects as go +import plotly.express as px +from io import StringIO -st.header("๐Ÿ“ˆ Linear Regression Simulator") +# Page configuration +st.set_page_config(page_title="Logistic Regression Simulator", layout="wide", page_icon="๐Ÿ“Š") -# Sample data -X = np.array([[1], [2], [3], [4], [5]]) -y = np.array([2, 4, 5, 4, 5]) +# Custom CSS for better styling +st.markdown(""" + +""", unsafe_allow_html=True) -# Train model -model = LinearRegression() -model.fit(X, y) +# Header +st.markdown('

๐Ÿ“Š Logistic Regression Simulator

', unsafe_allow_html=True) -# Predict -y_pred = model.predict(X) +st.markdown(""" +
+

๐ŸŽฏ About Logistic Regression

+

Logistic Regression is a statistical method for binary classification that predicts the probability + of an outcome belonging to a particular class. It's widely used in medical diagnosis, credit scoring, + and spam detection.

+
+""", unsafe_allow_html=True) -# Show results -st.subheader("Regression Line") -fig = plot_regression_line(X, y, model) -st.pyplot(fig) +# Sidebar for data input +st.sidebar.header("โš™๏ธ Configuration") +data_source = st.sidebar.radio("Choose Data Source:", ["Upload CSV", "Use Sample Dataset"]) + +df = None + +if data_source == "Upload CSV": + uploaded_file = st.sidebar.file_uploader("Upload your CSV file", type=['csv']) + if uploaded_file is not None: + df = pd.read_csv(uploaded_file) + st.sidebar.success("โœ… File uploaded successfully!") +else: + # Sample dataset (you can use sklearn datasets) + from sklearn.datasets import load_breast_cancer + data = load_breast_cancer() + df = pd.DataFrame(data.data, columns=data.feature_names) + df['target'] = data.target + st.sidebar.info("๐Ÿ“Š Using Breast Cancer Dataset (sample)") + +if df is not None: + # Display dataset info + st.markdown('

๐Ÿ“ Dataset Overview

', unsafe_allow_html=True) + + col1, col2, col3 = st.columns(3) + with col1: + st.metric("Total Rows", df.shape[0]) + with col2: + st.metric("Total Columns", df.shape[1]) + with col3: + st.metric("Missing Values", df.isnull().sum().sum()) + + with st.expander("๐Ÿ‘€ View Dataset"): + st.dataframe(df.head(10), use_container_width=True) + + # Feature selection + st.markdown('

๐ŸŽฏ Model Configuration

', unsafe_allow_html=True) + + col1, col2 = st.columns([2, 1]) + + with col1: + target_column = st.selectbox("Select Target Column (0/1):", df.columns) + + with col2: + test_size = st.slider("Test Size (%)", 10, 50, 20) / 100 + + # Select features + available_features = [col for col in df.columns if col != target_column] + selected_features = st.multiselect( + "Select Features for Training:", + available_features, + default=available_features[:min(5, len(available_features))] + ) + + if len(selected_features) > 0 and st.button("๐Ÿš€ Train Model"): + # Prepare data + X = df[selected_features] + y = df[target_column] + + # Handle missing values + X = X.fillna(X.mean()) + + # Split data + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=42 + ) + + # Scale features + scaler = StandardScaler() + X_train_scaled = scaler.fit_transform(X_train) + X_test_scaled = scaler.transform(X_test) + + # Train model + with st.spinner('๐Ÿ”„ Training model...'): + model = LogisticRegression(max_iter=1000, random_state=42) + model.fit(X_train_scaled, y_train) + + # Predictions + y_pred = model.predict(X_test_scaled) + y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] + + # Store in session state + st.session_state['model'] = model + st.session_state['scaler'] = scaler + st.session_state['features'] = selected_features + + st.success("โœ… Model trained successfully!") + + # ==================== TRAINING RESULTS ==================== + st.markdown('

๐Ÿ“ˆ Training Results

', unsafe_allow_html=True) + + col1, col2, col3, col4 = st.columns(4) + + accuracy = accuracy_score(y_test, y_pred) + + with col1: + st.markdown(f""" +
+

Accuracy

+

{accuracy:.2%}

+
+ """, unsafe_allow_html=True) + + with col2: + st.markdown(f""" +
+

Training Samples

+

{len(X_train)}

+
+ """, unsafe_allow_html=True) + + with col3: + st.markdown(f""" +
+

Test Samples

+

{len(X_test)}

+
+ """, unsafe_allow_html=True) + + with col4: + st.markdown(f""" +
+

Features Used

+

{len(selected_features)}

+
+ """, unsafe_allow_html=True) + + # ==================== PREDICTIONS ==================== + st.markdown('

๐Ÿ”ฎ Predictions

', unsafe_allow_html=True) + + predictions_df = pd.DataFrame({ + 'Actual': y_test.values, + 'Predicted': y_pred, + 'Probability': y_pred_proba + }) + + col1, col2 = st.columns([1, 1]) + + with col1: + st.write("**Sample Predictions:**") + st.dataframe(predictions_df.head(10), use_container_width=True) + + with col2: + # Prediction distribution + fig_pred = px.histogram( + predictions_df, + x='Probability', + color='Actual', + nbins=30, + title='Prediction Probability Distribution', + labels={'Probability': 'Predicted Probability', 'count': 'Frequency'}, + color_discrete_map={0: '#ff7675', 1: '#74b9ff'} + ) + fig_pred.update_layout(height=400) + st.plotly_chart(fig_pred, use_container_width=True) + + # ==================== CONFUSION MATRIX ==================== + st.markdown('

๐ŸŽฏ Confusion Matrix

', unsafe_allow_html=True) + + col1, col2 = st.columns([1, 1]) + + with col1: + # Create confusion matrix + cm = confusion_matrix(y_test, y_pred) + + # Plot using plotly for better interactivity + fig_cm = go.Figure(data=go.Heatmap( + z=cm, + x=['Predicted 0', 'Predicted 1'], + y=['Actual 0', 'Actual 1'], + text=cm, + texttemplate='%{text}', + textfont={"size": 20}, + colorscale='Blues', + showscale=True + )) + + fig_cm.update_layout( + title='Confusion Matrix', + xaxis_title='Predicted Label', + yaxis_title='True Label', + height=400 + ) + + st.plotly_chart(fig_cm, use_container_width=True) + + with col2: + # Classification report + st.write("**Classification Report:**") + report = classification_report(y_test, y_pred, output_dict=True) + report_df = pd.DataFrame(report).transpose() + st.dataframe(report_df.style.background_gradient(cmap='RdYlGn', subset=['precision', 'recall', 'f1-score']), + use_container_width=True) + + # ==================== ROC CURVE ==================== + st.markdown('

๐Ÿ“‰ ROC Curve

', unsafe_allow_html=True) + + # Calculate ROC curve + fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba) + roc_auc = auc(fpr, tpr) + + col1, col2 = st.columns([2, 1]) + + with col1: + # Plot ROC curve + fig_roc = go.Figure() + + fig_roc.add_trace(go.Scatter( + x=fpr, y=tpr, + mode='lines', + name=f'ROC Curve (AUC = {roc_auc:.3f})', + line=dict(color='#0984e3', width=3) + )) + + fig_roc.add_trace(go.Scatter( + x=[0, 1], y=[0, 1], + mode='lines', + name='Random Classifier', + line=dict(color='#d63031', width=2, dash='dash') + )) + + fig_roc.update_layout( + title='Receiver Operating Characteristic (ROC) Curve', + xaxis_title='False Positive Rate', + yaxis_title='True Positive Rate', + height=500, + hovermode='x', + legend=dict(x=0.6, y=0.1) + ) + + fig_roc.update_xaxes(range=[0, 1]) + fig_roc.update_yaxes(range=[0, 1]) + + st.plotly_chart(fig_roc, use_container_width=True) + + with col2: + st.markdown(f""" +
+

AUC Score

+

{roc_auc:.4f}

+
+ """, unsafe_allow_html=True) + + st.markdown(""" +
+

๐Ÿ“š Understanding AUC-ROC

+
    +
+  • AUC = 1.0: Perfect classifier
+  • AUC > 0.8: Excellent model
+  • AUC > 0.7: Good model
+  • AUC = 0.5: Random guess
+
+ """, unsafe_allow_html=True) + + # Feature importance + st.markdown('

โญ Feature Importance

', unsafe_allow_html=True) + + feature_importance = pd.DataFrame({ + 'Feature': selected_features, + 'Coefficient': model.coef_[0] + }).sort_values('Coefficient', key=abs, ascending=False) + + fig_importance = px.bar( + feature_importance, + x='Coefficient', + y='Feature', + orientation='h', + title='Feature Coefficients', + color='Coefficient', + color_continuous_scale='RdBu_r' + ) + fig_importance.update_layout(height=max(300, len(selected_features) * 30)) + st.plotly_chart(fig_importance, use_container_width=True) + +else: + st.info("๐Ÿ‘† Please upload a dataset or select the sample dataset to get started!") + + st.markdown(""" + ### ๐Ÿ“‹ Instructions: + 1. Choose a data source from the sidebar (Upload CSV or use sample dataset) + 2. Select your target column (binary: 0/1) + 3. Choose features for training + 4. Adjust the test size if needed + 5. Click **Train Model** to see results + + ### โœจ Features: + - ๐Ÿ“Š Interactive confusion matrix + - ๐Ÿ“ˆ ROC curve with AUC score + - ๐ŸŽฏ Detailed predictions with probabilities + - โญ Feature importance visualization + - ๐Ÿ“‰ Model performance metrics + """) + +# Footer +st.markdown("---") +st.markdown(""" +
+

๐ŸŽƒ Hacktoberfest Contribution | Built with Streamlit & Scikit-learn

+
+""", unsafe_allow_html=True) +# In pages/Logistic_Regression.py (or any model page) + +import streamlit as st +from utils.plot_helpers import plot_roc_curve, plot_confusion_matrix, plot_feature_importance + +# ... your existing code ... + +# After training the model: +if st.button("๐Ÿš€ Train Model"): + # ... training code ... + + # Get predictions + y_pred = model.predict(X_test) + y_pred_proba = model.predict_proba(X_test)[:, 1] + + # Display ROC Curve using helper + st.subheader("๐Ÿ“ˆ ROC Curve") + roc_fig = plot_roc_curve(y_test, y_pred_proba, title="ROC Curve - Logistic Regression") + st.pyplot(roc_fig) + + # Display Confusion Matrix using helper + st.subheader("๐ŸŽฏ Confusion Matrix") + cm_fig = plot_confusion_matrix(y_test, y_pred, labels=['Class 0', 'Class 1']) + st.pyplot(cm_fig) + + # Display Feature Importance using helper + st.subheader("โญ Feature Importance") + fi_fig = plot_feature_importance(selected_features, model.coef_[0]) + st.pyplot(fi_fig) diff --git a/requirements.txt b/requirements.txt index 14fd68d..67ebda4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,7 @@ scikit-learn matplotlib seaborn numpy +matplotlib==3.7.1 +seaborn==0.12.2 +scikit-learn==1.3.0 +numpy==1.24.3 diff --git a/sue b/sue new file mode 100644 index 0000000..74570f6 --- /dev/null +++ b/sue @@ -0,0 +1,324 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + ESC-j * Forward one file line (or _N file lines). + ESC-k * Backward one file line (or _N file lines). + f ^F ^V SPACE * Forward one window (or _N lines). 
+ b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + ESC-b * Backward one window, but don't stop at beginning-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. + F Forward forever; like "tail -f". + ESC-F Like F but stop when search pattern is found. + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. + --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ^O^N ^On * Search forward for (_N-th) OSC8 hyperlink. + ^O^P ^Op * Search backward for (_N-th) OSC8 hyperlink. + ^O^L ^Ol Jump to the currently selected OSC8 hyperlink. + ESC-u Undo (toggle) search highlighting. + ESC-U Clear search highlighting. + &_p_a_t_t_e_r_n * Display only matching lines. + --------------------------------------------------- + Search is case-sensitive unless changed with -i or -I. + A search pattern may begin with one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). 
+ ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + ^S _n Search for match in _n-th parenthesized subpattern. + ^W WRAP search if no match found. + ^L Enter next character literally into pattern. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. + } ) ] * Find open bracket { ( [. + ESC-^F _<_c_1_> _<_c_2_> * Find close bracket _<_c_2_>. + ESC-^B _<_c_1_> _<_c_2_> * Find open bracket _<_c_1_>. + --------------------------------------------------- + Each "find close bracket" command goes forward to the close bracket + matching the (_N-th) open bracket in the top line. + Each "find open bracket" command goes backward to the open bracket + matching the (_N-th) close bracket in the bottom line. + + m_<_l_e_t_t_e_r_> Mark the current top line with . + M_<_l_e_t_t_e_r_> Mark the current bottom line with . + '_<_l_e_t_t_e_r_> Go to a previously marked position. + '' Go to the previous position. + ^X^X Same as '. + ESC-m_<_l_e_t_t_e_r_> Clear a mark. + --------------------------------------------------- + A mark is any upper-case or lower-case letter. + Certain marks are predefined: + ^ means beginning of the file + $ means end of the file + --------------------------------------------------------------------------- + + CCHHAANNGGIINNGG FFIILLEESS + + :e [_f_i_l_e] Examine a new file. + ^X^V Same as :e. + :n * Examine the (_N-th) next file from the command line. + :p * Examine the (_N-th) previous file from the command line. + :x * Examine the first (or _N-th) file from the command line. + ^O^O Open the currently selected OSC8 hyperlink. 
+ :d Delete the current file from the command line list. + = ^G :f Print current file name. + --------------------------------------------------------------------------- + + MMIISSCCEELLLLAANNEEOOUUSS CCOOMMMMAANNDDSS + + -_<_f_l_a_g_> Toggle a command line option [see OPTIONS below]. + --_<_n_a_m_e_> Toggle a command line option, by name. + __<_f_l_a_g_> Display the setting of a command line option. + ___<_n_a_m_e_> Display the setting of an option, by name. + +_c_m_d Execute the less cmd each time a new file is examined. + + !_c_o_m_m_a_n_d Execute the shell command with $SHELL. + #_c_o_m_m_a_n_d Execute the shell command, expanded like a prompt. + |XX_c_o_m_m_a_n_d Pipe file between current pos & mark XX to shell command. + s _f_i_l_e Save input to a file. + v Edit the current file with $VISUAL or $EDITOR. + V Print version number of "less". + --------------------------------------------------------------------------- + + OOPPTTIIOONNSS + + Most options may be changed either on the command line, + or from within less by using the - or -- command. + Options may be given in one of two forms: either a single + character preceded by a -, or a name preceded by --. + + -? ........ --help + Display help (from command line). + -a ........ --search-skip-screen + Search skips current screen. + -A ........ --SEARCH-SKIP-SCREEN + Search starts just after target line. + -b [_N] .... --buffers=[_N] + Number of buffers. + -B ........ --auto-buffers + Don't automatically allocate buffers for pipes. + -c ........ --clear-screen + Repaint by clearing rather than scrolling. + -d ........ --dumb + Dumb terminal. + -D xx_c_o_l_o_r . --color=xx_c_o_l_o_r + Set screen colors. + -e -E .... --quit-at-eof --QUIT-AT-EOF + Quit at end of file. + -f ........ --force + Force open non-regular files. + -F ........ --quit-if-one-screen + Quit if entire file fits on first screen. + -g ........ --hilite-search + Highlight only last match for searches. + -G ........ 
--HILITE-SEARCH + Don't highlight any matches for searches. + -h [_N] .... --max-back-scroll=[_N] + Backward scroll limit. + -i ........ --ignore-case + Ignore case in searches that do not contain uppercase. + -I ........ --IGNORE-CASE + Ignore case in all searches. + -j [_N] .... --jump-target=[_N] + Screen position of target lines. + -J ........ --status-column + Display a status column at left edge of screen. + -k _f_i_l_e ... --lesskey-file=_f_i_l_e + Use a compiled lesskey file. + -K ........ --quit-on-intr + Exit less in response to ctrl-C. + -L ........ --no-lessopen + Ignore the LESSOPEN environment variable. + -m -M .... --long-prompt --LONG-PROMPT + Set prompt style. + -n ......... --line-numbers + Suppress line numbers in prompts and messages. + -N ......... --LINE-NUMBERS + Display line number at start of each line. + -o [_f_i_l_e] .. --log-file=[_f_i_l_e] + Copy to log file (standard input only). + -O [_f_i_l_e] .. --LOG-FILE=[_f_i_l_e] + Copy to log file (unconditionally overwrite). + -p _p_a_t_t_e_r_n . --pattern=[_p_a_t_t_e_r_n] + Start at pattern (from command line). + -P [_p_r_o_m_p_t] --prompt=[_p_r_o_m_p_t] + Define new prompt. + -q -Q .... --quiet --QUIET --silent --SILENT + Quiet the terminal bell. + -r -R .... --raw-control-chars --RAW-CONTROL-CHARS + Output "raw" control characters. + -s ........ --squeeze-blank-lines + Squeeze multiple blank lines. + -S ........ --chop-long-lines + Chop (truncate) long lines rather than wrapping. + -t _t_a_g .... --tag=[_t_a_g] + Find a tag. + -T [_t_a_g_s_f_i_l_e] --tag-file=[_t_a_g_s_f_i_l_e] + Use an alternate tags file. + -u -U .... --underline-special --UNDERLINE-SPECIAL + Change handling of backspaces, tabs and carriage returns. + -V ........ --version + Display the version number of "less". + -w ........ --hilite-unread + Highlight first new line after forward-screen. + -W ........ --HILITE-UNREAD + Highlight first new line after any forward movement. 
+ -x [_N[,...]] --tabs=[_N[,...]] + Set tab stops. + -X ........ --no-init + Don't use termcap init/deinit strings. + -y [_N] .... --max-forw-scroll=[_N] + Forward scroll limit. + -z [_N] .... --window=[_N] + Set size of window. + -" [_c[_c]] . --quotes=[_c[_c]] + Set shell quote characters. + -~ ........ --tilde + Don't display tildes after end of file. + -# [_N] .... --shift=[_N] + Set horizontal scroll amount (0 = one half screen width). + + --exit-follow-on-close + Exit F command on a pipe when writer closes pipe. + --file-size + Automatically determine the size of the input file. + --follow-name + The F command changes files if the input file is renamed. + --form-feed + Stop scrolling when a form feed character is reached. + --header=[_L[,_C[,_N]]] + Use _L lines (starting at line _N) and _C columns as headers. + --incsearch + Search file as each pattern character is typed in. + --intr=[_C] + Use _C instead of ^X to interrupt a read. + --lesskey-context=_t_e_x_t + Use lesskey source file contents. + --lesskey-src=_f_i_l_e + Use a lesskey source file. + --line-num-width=[_N] + Set the width of the -N line number field to _N characters. + --match-shift=[_N] + Show at least _N characters to the left of a search match. + --modelines=[_N] + Read _N lines from the input file and look for vim modelines. + --mouse + Enable mouse input. + --no-edit-warn + Don't warn when using v command on a file opened via LESSOPEN. + --no-keypad + Don't send termcap keypad init/deinit strings. + --no-histdups + Remove duplicates from command history. + --no-number-headers + Don't give line numbers to header lines. + --no-paste + Ignore pasted input. + --no-search-header-lines + Searches do not include header lines. + --no-search-header-columns + Searches do not include header columns. + --no-search-headers + Searches do not include header lines or columns. + --no-vbell + Disable the terminal's visual bell. + --redraw-on-quit + Redraw final screen when quitting. 
+ --rscroll=[_C] + Set the character used to mark truncated lines. + --save-marks + Retain marks across invocations of less. + --search-options=[EFKNRW-] + Set default options for every search. + --show-preproc-errors + Display a message if preprocessor exits with an error status. + --proc-backspace + Process backspaces for bold/underline. + --PROC-BACKSPACE + Treat backspaces as control characters. + --proc-return + Delete carriage returns before newline. + --PROC-RETURN + Treat carriage returns as control characters. + --proc-tab + Expand tabs to spaces. + --PROC-TAB + Treat tabs as control characters. + --status-col-width=[_N] + Set the width of the -J status column to _N characters. + --status-line + Highlight or color the entire line containing a mark. + --use-backslash + Subsequent options use backslash as escape char. + --use-color + Enables colored text. + --wheel-lines=[_N] + Each click of the mouse wheel moves _N lines. + --wordwrap + Wrap lines at spaces. + + + --------------------------------------------------------------------------- + + LLIINNEE EEDDIITTIINNGG + + These keys can be used to edit text being entered + on the "command line" at the bottom of the screen. + + RightArrow ..................... ESC-l ... Move cursor right one character. + LeftArrow ...................... ESC-h ... Move cursor left one character. + ctrl-RightArrow ESC-RightArrow ESC-w ... Move cursor right one word. + ctrl-LeftArrow ESC-LeftArrow ESC-b ... Move cursor left one word. + HOME ........................... ESC-0 ... Move cursor to start of line. + END ............................ ESC-$ ... Move cursor to end of line. + BACKSPACE ................................ Delete char to left of cursor. + DELETE ......................... ESC-x ... Delete char under cursor. + ctrl-BACKSPACE ESC-BACKSPACE ........... Delete word to left of cursor. + ctrl-DELETE .... ESC-DELETE .... ESC-X ... Delete word under cursor. + ctrl-U ......... ESC (MS-DOS only) ....... Delete entire line. 
+ UpArrow ........................ ESC-k ... Retrieve previous command line. + DownArrow ...................... ESC-j ... Retrieve next command line. + TAB ...................................... Complete filename & cycle. + SHIFT-TAB ...................... ESC-TAB Complete filename & reverse cycle. + ctrl-L ................................... Complete filename, list all. diff --git a/tatus b/tatus new file mode 100644 index 0000000..2a54845 --- /dev/null +++ b/tatus @@ -0,0 +1,30 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + ESC-j * Forward one file line (or _N file lines). + ESC-k * Backward one file line (or _N file lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + ESC-b * Backward one window, but don't stop at beginning-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. + F Forward forever; like "tail -f". 
"""
Data Helper Functions for ML Simulator
Author: Akshit
Date: October 13, 2025
Purpose: Generate and load sample datasets for ML training
"""

import numpy as np
import pandas as pd
# FIX: `load_boston` was removed in scikit-learn 1.2 (ethical concerns with the
# dataset); importing it raised ImportError and made this module unusable.
# It was never referenced in this file, so it is simply dropped.
from sklearn.datasets import (make_regression, make_classification,
                              load_iris, load_diabetes, load_breast_cancer,
                              load_wine)
from sklearn.preprocessing import StandardScaler


def generate_regression_dataset(n_samples=1000, n_features=10, noise=10.0,
                                n_informative=5, random_state=42,
                                include_bias=True, return_dataframe=True):
    """
    Generate a synthetic regression dataset for training and testing.

    Parameters
    ----------
    n_samples : int, default=1000
        Number of samples to generate (must be at least 10).
    n_features : int, default=10
        Total number of features.
    noise : float, default=10.0
        Standard deviation of the Gaussian noise added to the target.
        Higher values = more noise, harder to predict.
    n_informative : int, default=5
        Number of informative features; the remaining features carry no signal.
    random_state : int, default=42
        Random seed for reproducibility.
    include_bias : bool, default=True
        If True, a fixed bias/intercept of 100.0 is added to the target.
    return_dataframe : bool, default=True
        If True, return (pandas DataFrame, Series); otherwise numpy arrays.

    Returns
    -------
    X : DataFrame or ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : Series or ndarray of shape (n_samples,)
        Target variable.

    Raises
    ------
    ValueError
        If n_informative > n_features, or n_samples < 10.

    Examples
    --------
    >>> X, y = generate_regression_dataset(n_samples=500, n_features=5)
    >>> print(X.shape, y.shape)
    (500, 5) (500,)
    """
    # Validate parameters before handing off to sklearn for clearer errors.
    if n_informative > n_features:
        raise ValueError(
            f"n_informative ({n_informative}) cannot be greater than "
            f"n_features ({n_features})")
    if n_samples < 10:
        raise ValueError(f"n_samples must be at least 10, got {n_samples}")

    X, y, coef = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        noise=noise,
        bias=100.0 if include_bias else 0.0,
        random_state=random_state,
        coef=True,
    )

    if not return_dataframe:
        return X, y

    # DataFrame with human-readable column names plus generation metadata
    # stashed on `attrs` (note: attrs is not propagated by all pandas ops).
    feature_names = [f'Feature_{i+1}' for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name='target')
    X_df.attrs['n_informative'] = n_informative
    X_df.attrs['noise_level'] = noise
    X_df.attrs['true_coefficients'] = coef
    return X_df, y_series


def generate_regression_dataset_with_nonlinearity(n_samples=1000, n_features=5,
                                                  noise=10.0, degree=2,
                                                  random_state=42):
    """
    Generate a regression dataset with non-linear (polynomial) relationships.

    Parameters
    ----------
    n_samples : int
        Number of samples.
    n_features : int
        Number of base features.
    noise : float
        Standard deviation of the Gaussian noise added to the target.
    degree : int
        Polynomial degree applied per-feature (1=linear, 2=quadratic, ...).
    random_state : int
        Random seed (seeds the global numpy RNG).

    Returns
    -------
    X : DataFrame of shape (n_samples, n_features)
    y : Series of shape (n_samples,)
        Target built as a random-coefficient sum of feature^degree terms.
    """
    np.random.seed(random_state)

    # Base features ~ N(0, 1)
    X = np.random.randn(n_samples, n_features)

    # Target: sum of coef_i * x_i**degree with random per-feature coefficients.
    y = np.zeros(n_samples)
    for i in range(n_features):
        coef = np.random.randn()
        y += coef * (X[:, i] ** degree)

    y += np.random.normal(0, noise, n_samples)

    feature_names = [f'Feature_{i+1}' for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_series = pd.Series(y, name='target')
    return X_df, y_series


def generate_regression_dataset_with_outliers(n_samples=1000, n_features=10,
                                              outlier_fraction=0.1, noise=10.0,
                                              random_state=42):
    """
    Generate a regression dataset and inject a fraction of target outliers.

    Parameters
    ----------
    n_samples : int
        Number of samples.
    n_features : int
        Number of features.
    outlier_fraction : float
        Fraction of rows (0.0 to 1.0) whose target is shifted by +/- 3 std.
    noise : float
        Base noise level passed to generate_regression_dataset.
    random_state : int
        Random seed (also reseeds the global numpy RNG for outlier choice).

    Returns
    -------
    X : DataFrame
        Feature matrix (unchanged by outlier injection).
    y : Series
        Target with outliers injected.
    is_outlier : Series of bool
        Mask marking which rows were turned into outliers.
    """
    X, y = generate_regression_dataset(
        n_samples=n_samples,
        n_features=n_features,
        noise=noise,
        random_state=random_state
    )

    # Choose outlier rows, then shift each by +/- 3 standard deviations.
    np.random.seed(random_state)
    n_outliers = int(n_samples * outlier_fraction)
    outlier_indices = np.random.choice(n_samples, n_outliers, replace=False)

    y_std = y.std()
    y.iloc[outlier_indices] += np.random.choice([-1, 1], n_outliers) * (3 * y_std)

    is_outlier = pd.Series(False, index=range(n_samples), name='is_outlier')
    is_outlier.iloc[outlier_indices] = True

    return X, y, is_outlier


def generate_time_series_regression(n_samples=1000, trend='linear',
                                    seasonality=True, noise=5.0,
                                    random_state=42):
    """
    Generate a time-series regression dataset with trend + optional seasonality.

    Parameters
    ----------
    n_samples : int
        Number of time points.
    trend : str
        'linear', 'quadratic', or 'exponential'; anything else = no trend.
    seasonality : bool
        Whether to add a sinusoidal seasonal component (period 50).
    noise : float
        Standard deviation of the Gaussian noise.
    random_state : int
        Random seed.

    Returns
    -------
    X : DataFrame
        Features: time index, trend, sin/cos components, lagged target values
        (lags 1, 2, 5, 10; missing early lags filled with 0).
    y : Series
        Target time series.
    """
    np.random.seed(random_state)

    t = np.arange(n_samples)

    # Trend component
    if trend == 'linear':
        trend_component = 0.5 * t
    elif trend == 'quadratic':
        trend_component = 0.001 * (t ** 2)
    elif trend == 'exponential':
        trend_component = np.exp(0.001 * t)
    else:
        trend_component = np.zeros(n_samples)

    # Seasonal component (fixed period of 50 steps)
    if seasonality:
        seasonal_component = 10 * np.sin(2 * np.pi * t / 50)
    else:
        seasonal_component = np.zeros(n_samples)

    y = trend_component + seasonal_component + np.random.normal(0, noise, n_samples)

    X_data = {
        'time': t,
        'trend': trend_component,
        'sin_component': np.sin(2 * np.pi * t / 50),
        'cos_component': np.cos(2 * np.pi * t / 50),
    }

    # Autoregressive features: lagged copies of the target itself.
    y_series = pd.Series(y)
    for lag in [1, 2, 5, 10]:
        X_data[f'lag_{lag}'] = y_series.shift(lag).fillna(0).values

    X_df = pd.DataFrame(X_data)
    y_series = pd.Series(y, name='target')

    return X_df, y_series


def load_sample_regression_datasets():
    """
    Load built-in regression datasets from sklearn.

    Returns
    -------
    dict
        Mapping of dataset name -> dict with keys 'X', 'y', 'description',
        'task', 'n_samples', 'n_features'. The California Housing entry is
        best-effort: it requires a download and is skipped on any failure.
    """
    datasets = {}

    # Diabetes dataset (ships with sklearn, always available)
    diabetes = load_diabetes()
    datasets['diabetes'] = {
        'X': pd.DataFrame(diabetes.data, columns=diabetes.feature_names),
        'y': pd.Series(diabetes.target, name='progression'),
        'description': 'Diabetes progression prediction (442 samples, 10 features)',
        'task': 'regression',
        'n_samples': 442,
        'n_features': 10
    }

    # California Housing: fetched over the network, so treat as optional.
    # FIX: was a bare `except: pass`, which also swallowed KeyboardInterrupt
    # and SystemExit; keep the deliberate best-effort but narrow the catch.
    try:
        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()
        datasets['california_housing'] = {
            'X': pd.DataFrame(housing.data, columns=housing.feature_names),
            'y': pd.Series(housing.target, name='median_house_value'),
            'description': 'California housing prices (20640 samples, 8 features)',
            'task': 'regression',
            'n_samples': 20640,
            'n_features': 8
        }
    except Exception:
        pass

    return datasets
def get_dataset_by_name(name='diabetes', scaled=False):
    """
    Get a dataset by name, with optional feature scaling.

    Parameters
    ----------
    name : str
        One of 'diabetes', 'california_housing', 'generated'.
    scaled : bool
        If True, standardize features (zero mean, unit variance).

    Returns
    -------
    X : DataFrame
        Features.
    y : Series
        Target.

    Raises
    ------
    ValueError
        If `name` is not a recognized dataset.
    """
    if name == 'generated':
        X, y = generate_regression_dataset()
    elif name == 'diabetes':
        data = load_diabetes()
        X = pd.DataFrame(data.data, columns=data.feature_names)
        y = pd.Series(data.target, name='target')
    elif name == 'california_housing':
        # Imported lazily: requires a one-time download.
        from sklearn.datasets import fetch_california_housing
        data = fetch_california_housing()
        X = pd.DataFrame(data.data, columns=data.feature_names)
        y = pd.Series(data.target, name='target')
    else:
        raise ValueError(f"Unknown dataset: {name}")

    if scaled:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        X = pd.DataFrame(X_scaled, columns=X.columns)

    return X, y


def save_dataset_to_csv(X, y, filename='regression_dataset.csv'):
    """
    Save a generated dataset (features + target column) to a CSV file.

    Parameters
    ----------
    X : DataFrame
        Features.
    y : Series
        Target; written as a 'target' column alongside the features.
    filename : str
        Output filename.

    Returns
    -------
    str
        The filename written to.
    """
    df = X.copy()
    df['target'] = y
    df.to_csv(filename, index=False)
    # FIX: message previously printed a literal placeholder instead of the
    # actual filename; interpolate it properly.
    print(f"✅ Dataset saved to {filename}")
    return filename


# Test function
def test_data_helpers():
    """Smoke-test all dataset generation functions and print a summary."""
    print("🧪 Testing data helper functions...\n")

    # Test 1: Basic regression dataset
    print("1. Basic regression dataset:")
    X, y = generate_regression_dataset(n_samples=100, n_features=5)
    print(f"   Shape: X={X.shape}, y={y.shape}")
    print(f"   Features: {list(X.columns)}")
    print(f"   Target range: [{y.min():.2f}, {y.max():.2f}]\n")

    # Test 2: Noisy dataset
    print("2. Noisy regression dataset:")
    X, y = generate_regression_dataset(n_samples=100, noise=50.0)
    print(f"   Noise level: 50.0")
    print(f"   Target std: {y.std():.2f}\n")

    # Test 3: Non-linear dataset
    print("3. Non-linear regression dataset:")
    X, y = generate_regression_dataset_with_nonlinearity(n_samples=100, degree=2)
    print(f"   Polynomial degree: 2")
    print(f"   Shape: {X.shape}\n")

    # Test 4: Dataset with outliers
    print("4. Dataset with outliers:")
    X, y, outliers = generate_regression_dataset_with_outliers(n_samples=100, outlier_fraction=0.1)
    print(f"   Total outliers: {outliers.sum()}")
    print(f"   Outlier percentage: {(outliers.sum()/len(outliers))*100:.1f}%\n")

    # Test 5: Time series
    print("5. Time series regression:")
    X, y = generate_time_series_regression(n_samples=100)
    print(f"   Shape: {X.shape}")
    print(f"   Features: {list(X.columns)}\n")

    # Test 6: Load sample datasets
    print("6. Sample datasets:")
    datasets = load_sample_regression_datasets()
    for name, data in datasets.items():
        print(f"   - {name}: {data['description']}")

    print("\n✅ All tests passed!")


if __name__ == "__main__":
    test_data_helpers()


# ============================================================================
# utils/plot_helper.py
# ============================================================================
"""
Plot Helper Functions for ML Simulator
Author: Akshit
Date: October 13, 2025
Purpose: Utility functions for creating matplotlib/seaborn plots for Streamlit
"""

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import roc_curve, auc, confusion_matrix
import io
import base64

# Global plot defaults applied at import time.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 10


def plot_roc_curve(y_true, y_pred_proba, title="ROC Curve", return_fig=True):
    """
    Plot a ROC curve with its AUC score.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 or 1).
    y_pred_proba : array-like
        Predicted probabilities for the positive class.
    title : str, optional
        Plot title (default: "ROC Curve").
    return_fig : bool, optional
        If True, return the matplotlib Figure (for st.pyplot);
        if False, return a base64-encoded PNG string for embedding.

    Returns
    -------
    matplotlib.figure.Figure or str
        Figure object, or base64-encoded PNG string.
    """
    # ROC points and area under the curve
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(10, 8))

    # ROC curve itself (markers thinned out so the line stays readable)
    ax.plot(fpr, tpr, color='#0984e3', linewidth=3,
            label=f'ROC Curve (AUC = {roc_auc:.3f})', marker='o',
            markersize=4, markevery=20)

    # Diagonal reference = performance of a random classifier
    ax.plot([0, 1], [0, 1], color='#d63031', linestyle='--',
            linewidth=2, label='Random Classifier (AUC = 0.5)')

    ax.fill_between(fpr, tpr, alpha=0.2, color='#74b9ff')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.legend(loc="lower right", fontsize=11, framealpha=0.9)
    ax.grid(True, alpha=0.3)

    # Large AUC annotation inside the plot area
    ax.text(0.6, 0.2, f'AUC Score\n{roc_auc:.4f}',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
            fontsize=12, fontweight='bold')

    plt.tight_layout()

    if return_fig:
        return fig

    # Encode to base64 for HTML embedding; close this figure explicitly
    # (FIX: plt.close() closed the "current" figure, which is only correct
    # by coincidence — close the one we created).
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
    buffer.seek(0)
    img_str = base64.b64encode(buffer.read()).decode()
    plt.close(fig)
    return img_str
def plot_confusion_matrix(y_true, y_pred, labels=None, title="Confusion Matrix"):
    """
    Render a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Model-predicted labels.
    labels : list, optional
        Axis tick labels; defaults to ['0', '1'].
    title : str, optional
        Title drawn above the plot.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object for Streamlit display.
    """
    matrix = confusion_matrix(y_true, y_pred)
    tick_labels = labels if labels else ['0', '1']

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        square=True,
        linewidths=2,
        cbar_kws={"shrink": 0.8},
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
        annot_kws={'size': 14, 'weight': 'bold'},
    )

    ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    return fig


def plot_feature_importance(feature_names, importance_scores, top_n=10, title="Feature Importance"):
    """
    Draw a horizontal bar chart of the top-N features by |importance|.

    Positive scores are green, negative scores are red; each bar is
    annotated with its numeric value.

    Parameters
    ----------
    feature_names : list
        Names for each feature.
    importance_scores : array-like
        Importance (e.g. coefficient) per feature.
    top_n : int, optional
        How many of the highest-magnitude features to show (default: 10).
    title : str, optional
        Plot title.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object for Streamlit display.
    """
    # Rank by absolute magnitude, keep the top_n largest.
    order = np.argsort(np.abs(importance_scores))[::-1][:top_n]
    shown_names = [feature_names[idx] for idx in order]
    shown_scores = importance_scores[order]

    fig, ax = plt.subplots(figsize=(10, max(6, top_n * 0.5)))

    bar_colors = ['#00b894' if s > 0 else '#ff7675' for s in shown_scores]
    ax.barh(range(len(shown_names)), shown_scores, color=bar_colors, alpha=0.8)

    # Numeric label just beyond the tip of each bar.
    for row, s in enumerate(shown_scores):
        ax.text(s + 0.01 if s > 0 else s - 0.01, row,
                f'{s:.3f}', va='center',
                ha='left' if s > 0 else 'right',
                fontweight='bold', fontsize=10)

    ax.set_yticks(range(len(shown_names)))
    ax.set_yticklabels(shown_names, fontsize=11)
    ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.axvline(x=0, color='black', linewidth=1, linestyle='-')
    ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    return fig


def plot_prediction_distribution(y_pred_proba, y_true, title="Prediction Probability Distribution"):
    """
    Overlay histograms of predicted probabilities, split by true class.

    Parameters
    ----------
    y_pred_proba : array-like
        Predicted probabilities for the positive class.
    y_true : array-like
        True labels (0/1), used to split the histogram.
    title : str, optional
        Plot title.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object for Streamlit display.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Split predictions according to the actual class of each sample.
    negatives = y_pred_proba[y_true == 0]
    positives = y_pred_proba[y_true == 1]

    ax.hist(negatives, bins=30, alpha=0.6, color='#ff7675',
            label='Actual Class 0', edgecolor='black')
    ax.hist(positives, bins=30, alpha=0.6, color='#74b9ff',
            label='Actual Class 1', edgecolor='black')

    # Mark the default 0.5 classification cutoff.
    ax.axvline(x=0.5, color='green', linestyle='--', linewidth=2,
               label='Decision Threshold (0.5)')

    ax.set_xlabel('Predicted Probability', fontsize=12, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.legend(fontsize=11, framealpha=0.9)
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    return fig
def plot_residuals(y_true, y_pred, title="Residual Plot"):
    """
    Diagnostic residual plots for regression models.

    Produces a two-panel figure: residuals vs predicted values (left)
    and a histogram of the residual distribution (right).

    Parameters
    ----------
    y_true : array-like
        True values.
    y_pred : array-like
        Predicted values.
    title : str, optional
        Overall figure title.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object for Streamlit display.
    """
    errors = y_true - y_pred

    fig, (scatter_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: residual scatter with a zero reference line.
    scatter_ax.scatter(y_pred, errors, alpha=0.6, color='#6c5ce7', edgecolor='black')
    scatter_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    scatter_ax.set_xlabel('Predicted Values', fontsize=12, fontweight='bold')
    scatter_ax.set_ylabel('Residuals', fontsize=12, fontweight='bold')
    scatter_ax.set_title('Residuals vs Predicted', fontsize=13, fontweight='bold')
    scatter_ax.grid(alpha=0.3)

    # Right panel: residual distribution with a zero reference line.
    hist_ax.hist(errors, bins=30, alpha=0.7, color='#00b894', edgecolor='black')
    hist_ax.axvline(x=0, color='red', linestyle='--', linewidth=2)
    hist_ax.set_xlabel('Residuals', fontsize=12, fontweight='bold')
    hist_ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    hist_ax.set_title('Residual Distribution', fontsize=13, fontweight='bold')
    hist_ax.grid(axis='y', alpha=0.3)

    fig.suptitle(title, fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    return fig


def plot_actual_vs_predicted(y_true, y_pred, title="Actual vs Predicted"):
    """
    Scatter actual vs predicted values with a perfect-prediction diagonal.

    Parameters
    ----------
    y_true : array-like
        True values.
    y_pred : array-like
        Predicted values.
    title : str, optional
        Plot title.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object for Streamlit display.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    ax.scatter(y_true, y_pred, alpha=0.6, color='#0984e3',
               edgecolor='black', s=50)

    # y = x diagonal spanning the combined data range.
    low = min(y_true.min(), y_pred.min())
    high = max(y_true.max(), y_pred.max())
    ax.plot([low, high], [low, high],
            'r--', linewidth=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Values', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Values', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.legend(fontsize=11)
    ax.grid(alpha=0.3)
    # Equal aspect so distance from the diagonal is visually meaningful.
    ax.set_aspect('equal', adjustable='box')

    plt.tight_layout()
    return fig


# Example usage function for testing
def test_plot_helpers():
    """Smoke-test the plotting helpers on a small synthetic dataset."""
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    # Small synthetic binary-classification problem.
    X, y = make_classification(n_samples=200, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Exercise one plot of each classification kind.
    roc_fig = plot_roc_curve(y_test, y_pred_proba)
    cm_fig = plot_confusion_matrix(y_test, y_pred)
    fi_fig = plot_feature_importance(
        [f'Feature {i}' for i in range(10)],
        model.coef_[0]
    )

    print("✅ All plot functions working correctly!")
    return roc_fig, cm_fig, fi_fig


if __name__ == "__main__":
    test_plot_helpers()