Refactor: Add CI, Dockerfile, and tests

cursoragent · aditi-gupta-git · cursoragent · commit ed763a1d1dcc · 2025-10-18T13:58:38.000Z
Co-authored-by: guptaaditi.0825 <guptaaditi.0825@gmail.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,45 @@
+name: CI
+
+# Run on every push and every pull request, on all branches
+# ('**' already matches main and master).
+on:
+  push:
+    branches: [ '**' ]
+  pull_request:
+    branches: [ '**' ]
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pytest ruff
+
+      # Lint is advisory: --exit-zero keeps rule violations from failing the
+      # job, but a broken ruff invocation or config still fails it (which
+      # `|| true` would have hidden).
+      - name: Lint with ruff
+        run: |
+          ruff --version
+          ruff check . --exit-zero
+
+      # Test failures must fail the job. `python -m pytest` also puts the
+      # repository root on sys.path so `src` is importable from tests/.
+      - name: Run tests
+        run: |
+          python -m pytest -q
+
+      # Execute app.py as a module and check that it defines an `app` object.
+      - name: Smoke import app
+        run: |
+          python - <<'PY'
+          import importlib.util
+          spec = importlib.util.spec_from_file_location('app', 'app.py')
+          mod = importlib.util.module_from_spec(spec)
+          spec.loader.exec_module(mod)
+          assert hasattr(mod, 'app'), 'Flask app not found'
+          print('App import and object check passed')
+          PY
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,19 @@
+# Container image that serves the Flask app with gunicorn on port 5000.
+FROM python:3.11-slim
+
+# Skip writing .pyc files, keep stdout/stderr unbuffered, and disable pip's cache.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1
+
+WORKDIR /app
+
+# Install a compiler toolchain, then drop the apt package lists in the same layer.
+# NOTE(review): presumably needed to build some dependencies from source - confirm.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies before copying the rest of the source so this layer is
+# reused from cache when only application code changes.
+COPY requirements.txt ./
+RUN pip install --upgrade pip && pip install -r requirements.txt
+
+COPY . .
+
+# gunicorn binds to all interfaces on port 5000 and serves `app` from app.py.
+EXPOSE 5000
+CMD ["gunicorn", "app:app", "--bind", "0.0.0.0:5000"]
diff --git a/README.md b/README.md
@@ -1,3 +1,109 @@
-STUDENT PERFORMANCE PREDICTOR
+Student Exam Performance Predictor
+=================================
 
-LIVE DEMO: https://student-performance-predictor-0hqd.onrender.com
+Live demo: https://student-performance-predictor-0hqd.onrender.com
+
+Predict a student's maths score from demographic factors and prior reading/writing scores using a trained ML pipeline, served via Flask.
+
+Features
+--------
+- Flask web app with a simple form UI
+- Sklearn preprocessing pipeline with numeric/categorical handling
+- Model selection across multiple regressors with hyperparameter tuning
+- Persisted preprocessor and model artifacts in `artifacts/`
+- One-click deploy with Gunicorn + Procfile (compatible with Render/Heroku)
+
+Tech stack
+---------
+- Python, Flask, Jinja2
+- scikit-learn, CatBoost, XGBoost
+- pandas, numpy
+
+Project structure
+-----------------
+```text
+.
+├── app.py                      # Flask application
+├── src/
+│   ├── components/
+│   │   ├── data_ingestion.py   # Load CSV, split train/test
+│   │   ├── data_transformation.py # Preprocess pipelines, save preprocessor
+│   │   └── model_trainer.py    # Model selection, tuning, save best model
+│   ├── pipeline/
+│   │   ├── predict_pipeline.py # Inference with saved artifacts
+│   │   └── train_pipeline.py   # End-to-end training entrypoint
+│   ├── utils.py                # IO helpers, grid search evaluate
+│   ├── exception.py            # Custom exception wrapper
+│   └── logger.py               # File logging setup
+├── templates/                  # Jinja templates for UI
+├── artifacts/                  # Saved data, preprocessor, model
+├── notebook/                   # EDA and training notebooks
+├── requirements.txt
+├── setup.py
+└── Procfile
+```
+
+Quick start
+-----------
+1) Setup environment
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install --upgrade pip
+pip install -r requirements.txt
+```
+
+2) Train the model (artifacts will be created under `artifacts/`)
+```bash
+python -m src.pipeline.train_pipeline
+```
+
+3) Run locally
+```bash
+python app.py
+# or production style
+gunicorn app:app --bind 0.0.0.0:5000
+```
+
+4) Open the app
+- Navigate to `http://127.0.0.1:5000/`
+
+Usage
+-----
+- Fill in gender, race/ethnicity, parental education, lunch, test preparation, and prior reading/writing scores.
+- Submit to get the predicted maths score. Artifacts must exist at `artifacts/model.pkl` and `artifacts/preprocessor.pkl`.
+
+Data
+----
+- Training data is read from `notebook/data/stud.csv` inside the training pipeline.
+- The pipeline splits into train/test and stores copies in `artifacts/train.csv` and `artifacts/test.csv`.
+
+Deployment
+----------
+- The repository includes a `Procfile` for platforms like Render/Heroku. The command used is `gunicorn app:app`.
+
+Development scripts
+-------------------
+Common commands:
+```bash
+# Run training
+python -m src.pipeline.train_pipeline
+
+# Start web server
+python app.py
+
+# Lint (ruff is configured in pyproject.toml)
+ruff check .
+```
+
+Roadmap / Improvements
+----------------------
+- Add CI (GitHub Actions) to run formatting, linting, and a smoke test
+- Add unit tests for `utils.load_object/save_object` and `PredictPipeline`
+- Add type hints and docstrings for public functions
+- Replace hardcoded paths with environment variables where appropriate
+- Add Dockerfile for reproducible deployments
+- Add CONTRIBUTING.md and CODE_OF_CONDUCT.md
+
+License
+-------
+If you intend the project to be open-source, add a `LICENSE` file (e.g., MIT).
diff --git a/app.py b/app.py
@@ -1,14 +1,11 @@
-from flask import Flask,request,render_template
-import numpy as np
-import pandas as pd
+from flask import Flask, request, render_template
 import threading
 import webbrowser
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
 
 
-from sklearn.preprocessing import StandardScaler
-from src.pipeline.predict_pipeline import CustomData,PredictPipeline
+from src.pipeline.predict_pipeline import CustomData, PredictPipeline
 
 application=Flask(__name__)
 
@@ -25,25 +22,23 @@ def predict_datapoint():
     if request.method=='GET':
         return render_template('home.html')
     else:
-        data=CustomData(
-            gender=request.form.get('gender'),
-            race_ethnicity=request.form.get('ethnicity'),
-            parental_level_of_education=request.form.get('parental_level_of_education'),
-            lunch=request.form.get('lunch'),
-            test_preparation_course=request.form.get('test_preparation_course'),
-            reading_score=float(request.form.get('writing_score')),
-            writing_score=float(request.form.get('reading_score'))
-
-        )
-        pred_df=data.get_data_as_data_frame()
-        print(pred_df)
-        print("Before Prediction")
-
-        predict_pipeline=PredictPipeline()
-        print("Mid Prediction")
-        results=predict_pipeline.predict(pred_df)
-        print("after Prediction")
-        return render_template('home.html',results=results[0])
+        try:
+            data = CustomData(
+                gender=request.form.get('gender'),
+                race_ethnicity=request.form.get('ethnicity'),
+                parental_level_of_education=request.form.get('parental_level_of_education'),
+                lunch=request.form.get('lunch'),
+                test_preparation_course=request.form.get('test_preparation_course'),
+                reading_score=float(request.form.get('reading_score')),
+                writing_score=float(request.form.get('writing_score')),
+            )
+            pred_df = data.get_data_as_data_frame()
+
+            predict_pipeline = PredictPipeline()
+            results = predict_pipeline.predict(pred_df)
+            return render_template('home.html', results=results[0])
+        except Exception as e:
+            return render_template('home.html', error=str(e)), 400
            
 
 def open_browser():
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,5 @@
+# Ruff linter settings.
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+# Rule families: E/W = pycodestyle errors/warnings, F = Pyflakes, I = isort.
+lint.select = ["E", "F", "W", "I"]
+# E501 is "line too long", so the linter does not report lines over line-length.
+lint.ignore = ["E501"]
diff --git a/src/pipeline/train_pipeline.py b/src/pipeline/train_pipeline.py
@@ -0,0 +1,42 @@
+import os
+import sys
+from dataclasses import dataclass
+
+from src.exception import CustomException
+from src.logger import logging
+from src.components.data_ingestion import DataIngestion
+from src.components.data_transformation import DataTransformation
+from src.components.model_trainer import ModelTrainer
+
+
+@dataclass
+class TrainConfig:
+    """Default file locations under ``artifacts/`` for a training run.
+
+    NOTE(review): nothing in this module reads these fields;
+    ``run_training_pipeline`` gets its paths from ``DataIngestion`` - confirm
+    whether this class is still needed.
+    """
+    # All three paths are relative, so they resolve against the working directory.
+    raw_data_path: str = os.path.join('artifacts', 'data.csv')
+    train_data_path: str = os.path.join('artifacts', 'train.csv')
+    test_data_path: str = os.path.join('artifacts', 'test.csv')
+
+
+def run_training_pipeline() -> float:
+    try:
+        logging.info("Training pipeline started")
+
+        data_ingestion = DataIngestion()
+        train_path, test_path = data_ingestion.initiate_data_ingestion()
+
+        data_transformation = DataTransformation()
+        train_arr, test_arr, _ = data_transformation.initiate_data_transformation(
+            train_path, test_path
+        )
+
+        model_trainer = ModelTrainer()
+        r2 = model_trainer.initiate_model_trainer(train_arr, test_arr)
+
+        logging.info(f"Training pipeline completed. Test R2: {r2}")
+        return r2
+    except Exception as e:
+        raise CustomException(e, sys)
+
+
+# Script entry point, e.g. `python -m src.pipeline.train_pipeline`.
+if __name__ == "__main__":
+    run_training_pipeline()
+
diff --git a/templates/home.html b/templates/home.html
@@ -101,6 +101,12 @@ <h1 class="text-3xl font-bold mb-6">Prediction Form</h1>
         if (el) el.scrollIntoView({ behavior: 'smooth', block: 'center' });
       </script>
     {% endif %}
+
+    {# Render the banner only when the view passed a non-empty `error`. An undefined variable is not `none`, so `is not none` was true on every render. #}
+    {% if error is defined and error %}
+      <div class="mt-6 p-5 bg-red-100 text-red-800 text-center rounded-xl font-semibold text-lg">
+        {{ error }}
+      </div>
+    {% endif %}
   </div>
 
   <!-- Right: Tips panel -->
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,13 @@
+import os
+import tempfile
+
+from src.utils import save_object, load_object
+
+
+def test_save_and_load_object_roundtrip():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, 'obj.pkl')
+        data = {'a': 1, 'b': [1, 2, 3]}
+        save_object(path, data)
+        loaded = load_object(path)
+        assert loaded == data