diff --git a/.gitignore b/.gitignore index b6e4761..4b5eed7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,12 @@ __pycache__/ *.py[cod] *$py.class +models/ +data/processed/ # C extensions *.so +venv/ +.venv/ # Distribution / packaging .Python diff --git a/Project.py b/Project.py deleted file mode 100644 index 92d05e2..0000000 --- a/Project.py +++ /dev/null @@ -1,307 +0,0 @@ -import pandas as pd -import seaborn as sb -import matplotlib.pyplot as plt -import time as t -import sklearn.utils as u -import sklearn.preprocessing as pp -import sklearn.tree as tr -import sklearn.ensemble as es -import sklearn.metrics as m -import sklearn.linear_model as lm -import sklearn.neural_network as nn -import numpy as np -#import random as rnd -import warnings as w -w.filterwarnings('ignore') -data = pd.read_csv("AI-Data.csv") -# ---- NEW FEATURE: Correlation Heatmap ---- -plt.figure(figsize=(12, 8)) -sb.heatmap(data.corr(numeric_only=True), annot=True, cmap="coolwarm") -plt.title("Correlation Heatmap") -plt.show() -# ----------------------------------------- -ch = 0 -while(ch != 10): - print("1.Marks Class Count Graph\t2.Marks Class Semester-wise Graph\n3.Marks Class Gender-wise Graph\t4.Marks Class Nationality-wise Graph\n5.Marks Class Grade-wise Graph\t6.Marks Class Section-wise Graph\n7.Marks Class Topic-wise Graph\t8.Marks Class Stage-wise Graph\n9.Marks Class Absent Days-wise\t10.No Graph\n") - ch = int(input("Enter Choice: ")) - if (ch == 1): - print("Loading Graph....\n") - t.sleep(1) - print("\tMarks Class Count Graph") - axes = sb.countplot(x='Class', data=data, order=['L', 'M', 'H']) - plt.show() - elif (ch == 2): - print("Loading Graph....\n") - t.sleep(1) - print("\tMarks Class Semester-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='Semester', hue='Class', data=data, hue_order=['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 3): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Gender-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='gender', hue='Class', data=data, order=['M', 'F'], hue_order=['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 4): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Nationality-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='NationalITy', hue='Class', data=data, hue_order=['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 5): - print("Loading Graph: \n") - t.sleep(1) - print("\tMarks Class Grade-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='GradeID', hue='Class', data=data, order=['G-02', 'G-04', 'G-05', 'G-06', 'G-07', 'G-08', 'G-09', 'G-10', 'G-11', 'G-12'], hue_order = ['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch ==6): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Section-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='SectionID', hue='Class', data=data, hue_order = ['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 7): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Topic-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='Topic', hue='Class', data=data, hue_order = ['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 8): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Stage-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='StageID', hue='Class', data=data, hue_order = ['L', 'M', 'H'], axes=axesarr) - plt.show() - elif (ch == 9): - print("Loading Graph..\n") - t.sleep(1) - print("\tMarks Class Absent Days-wise Graph") - fig, axesarr = plt.subplots(1, figsize=(10, 6)) - sb.countplot(x='StudentAbsenceDays', hue='Class', data=data, hue_order = ['L', 'M', 'H'], axes=axesarr) - plt.show() -if(ch == 10): - print("Exiting..\n") - t.sleep(1) -#cor = data.corr() -#print(cor) -data = data.drop("gender", axis=1) -data = data.drop("StageID", axis=1) -data = data.drop("GradeID", axis=1) -data = data.drop("NationalITy", axis=1) -data = data.drop("PlaceofBirth", axis=1) -data = data.drop("SectionID", axis=1) -data = data.drop("Topic", axis=1) -data = data.drop("Semester", axis=1) -data = data.drop("Relation", axis=1) -data = data.drop("ParentschoolSatisfaction", axis=1) -data = data.drop("ParentAnsweringSurvey", axis=1) -#data = data.drop("VisITedResources", axis=1) -data = data.drop("AnnouncementsView", axis=1) -u.shuffle(data) -countD = 0 -countP = 0 -countL = 0 -countR = 0 -countN = 0 -gradeID_dict = {"G-01" : 1, - "G-02" : 2, - "G-03" : 3, - "G-04" : 4, - "G-05" : 5, - "G-06" : 6, - "G-07" : 7, - "G-08" : 8, - "G-09" : 9, - "G-10" : 10, - "G-11" : 11, - "G-12" : 12} -data = data.replace({"GradeID" : gradeID_dict}) -#sig = [] -for column in data.columns: - if data[column].dtype == type(object): - le = pp.LabelEncoder() - data[column] = le.fit_transform(data[column]) -ind = int(len(data) * 0.70) -feats = data.values[:, 0:4] -lbls = data.values[:,4] -feats_Train = feats[0:ind] -feats_Test = feats[(ind+1):len(feats)] -lbls_Train = lbls[0:ind] -lbls_Test = lbls[(ind+1):len(lbls)] -modelD = tr.DecisionTreeClassifier() -modelD.fit(feats_Train, lbls_Train) -lbls_predD = modelD.predict(feats_Test) -for a,b in zip(lbls_Test, lbls_predD): - if(a==b): - countD += 1 -accD = (countD/len(lbls_Test)) -print("\nAccuracy measures using Decision Tree:") -print(m.classification_report(lbls_Test, lbls_predD),"\n") -print("\nAccuracy using Decision Tree: ", str(round(accD, 3))) -t.sleep(1) -modelR = es.RandomForestClassifier() -modelR.fit(feats_Train, lbls_Train) -lbls_predR = modelR.predict(feats_Test) -for a,b in zip(lbls_Test, lbls_predR): - if(a==b): - countR += 1 -print("\nAccuracy Measures for Random Forest Classifier: \n") -#print("\nConfusion Matrix: \n", m.confusion_matrix(lbls_Test, lbls_predR)) -print("\n", m.classification_report(lbls_Test,lbls_predR)) -accR = countR/len(lbls_Test) -print("\nAccuracy using Random Forest: ", str(round(accR, 3))) -t.sleep(1) -modelP = lm.Perceptron() -modelP.fit(feats_Train, lbls_Train) -lbls_predP = modelP.predict(feats_Test) -for a,b in zip(lbls_Test, lbls_predP): - if a == b: - countP += 1 -accP = countP/len(lbls_Test) -print("\nAccuracy measures using Linear Model Perceptron:") -print(m.classification_report(lbls_Test, lbls_predP),"\n") -print("\nAccuracy using Linear Model Perceptron: ", str(round(accP, 3)), "\n") -t.sleep(1) -modelL = lm.LogisticRegression() -modelL.fit(feats_Train, lbls_Train) -lbls_predL = modelL.predict(feats_Test) -for a,b in zip(lbls_Test, lbls_predL): - if a == b: - countL += 1 -accL = countL/len(lbls_Test) -print("\nAccuracy measures using Linear Model Logistic Regression:") -print(m.classification_report(lbls_Test, lbls_predL),"\n") -print("\nAccuracy using Linear Model Logistic Regression: ", str(round(accP, 3)), "\n") -t.sleep(1) -modelN = nn.MLPClassifier(activation="logistic") -modelN.fit(feats_Train, lbls_Train) -lbls_predN = modelN.predict(feats_Test) -for a,b in zip(lbls_Test, lbls_predN): - #sig.append(1/(1+ np.exp(-b))) - if a==b: - countN += 1 -#print("\nAverage value of Sigmoid Function: ", str(round(np.average(sig), 3))) -print("\nAccuracy measures using MLP Classifier:") -print(m.classification_report(lbls_Test, lbls_predN),"\n") -accN = countN/len(lbls_Test) -print("\nAccuracy using Neural Network MLP Classifier: ", str(round(accN, 3)), "\n") -choice = input("Do you want to test specific input (y or n): ") -if(choice.lower()=="y"): - gen = input("Enter Gender (M or F): ") - if (gen.upper() == "M"): - gen = 1 - elif (gen.upper() == "F"): - gen = 0 - nat = input("Enter Nationality: ") - pob = input("Place of Birth: ") - gra = input("Grade ID as (G-): ") - if(gra == "G-02"): - gra = 2 - elif (gra == "G-04"): - gra = 4 - elif (gra == "G-05"): - gra = 5 - elif (gra == "G-06"): - gra = 6 - elif (gra == "G-07"): - gra = 7 - elif (gra == "G-08"): - gra = 8 - elif (gra == "G-09"): - gra = 9 - elif (gra == "G-10"): - gra = 10 - elif (gra == "G-11"): - gra = 11 - elif (gra == "G-12"): - gra = 12 - sec = input("Enter Section: ") - top = input("Enter Topic: ") - sem = input("Enter Semester (F or S): ") - if (sem.upper() == "F"): - sem = 0 - elif (sem.upper() == "S"): - sem = 1 - rel = input("Enter Relation (Father or Mum): ") - if (rel == "Father"): - rel = 0 - elif (rel == "Mum"): - rel = 1 - rai = int(input("Enter raised hands: ")) - res = int(input("Enter Visited Resources: ")) - ann = int(input("Enter announcements viewed: ")) - dis = int(input("Enter no. of Discussions: ")) - sur = input("Enter Parent Answered Survey (Y or N): ") - if (sur.upper() == "Y"): - sur = 1 - elif (sur.upper() == "N"): - sur = 0 - sat = input("Enter Parent School Satisfaction (Good or Bad): ") - if (sat == "Good"): - sat = 1 - elif (sat == "Bad"): - sat = 0 - absc = input("Enter No. of Abscenes(Under-7 or Above-7): ") - if (absc == "Under-7"): - absc = 1 - elif (absc == "Above-7"): - absc = 0 - arr = np.array([rai, res, dis, absc]) - #arr = np.array([gen, rnd.randint(0, 30), rnd.randint(0, 30), sta, gra, rnd.randint(0, 30), rnd.randint(0, 30), sem, rel, rai, res, ann, dis, sur, sat, absc]) - predD = modelD.predict(arr.reshape(1, -1)) - predR = modelR.predict(arr.reshape(1, -1)) - predP = modelP.predict(arr.reshape(1, -1)) - predL = modelL.predict(arr.reshape(1, -1)) - predN = modelN.predict(arr.reshape(1, -1)) - if (predD == 0): - predD = "H" - elif (predD == 1): - predD = "M" - elif (predD == 2): - predD = "L" - if (predR == 0): - predR = "H" - elif (predR == 1): - predR = "M" - elif (predR == 2): - predR = "L" - if (predP == 0): - predP = "H" - elif (predP == 1): - predP = "M" - elif (predP == 2): - predP = "L" - if (predL == 0): - predL = "H" - elif (predL == 1): - predL = "M" - elif (predL == 2): - predL = "L" - if (predN == 0): - predN = "H" - elif (predN == 1): - predN = "M" - elif (predN == 2): - predN = "L" - t.sleep(1) - print("\nUsing Decision Tree Classifier: ", predD) - t.sleep(1) - print("Using Random Forest Classifier: ", predR) - t.sleep(1) - print("Using Linear Model Perceptron: ", predP) - t.sleep(1) - print("Using Linear Model Logisitic Regression: ", predL) - t.sleep(1) - print("Using Neural Network MLP Classifier: ", predN) - print("\nExiting...") - t.sleep(1) -else: - print("Exiting..") - t.sleep(1) \ No newline at end of file diff --git a/AI-Data.csv b/data/raw/AI-Data.csv similarity index 100% rename from AI-Data.csv rename to data/raw/AI-Data.csv diff --git a/main.py b/main.py new file mode 100644 index 0000000..dd9ce25 --- /dev/null +++ b/main.py @@ -0,0 +1,43 @@ +from src.load_data import load_data, save_data +from src.preprocessing import prepare_features +from src.train_and_evaluate import train_and_evaluate +from src.visualization import menu_plots +from src.inference import save_model_artifacts, predict_from_input, load_model_artifacts +import os + +DATA_PATH = "data/raw/AI-Data.csv" + + +def main(): + + df = load_data(DATA_PATH) + if df is None: + return + + menu_plots(df) + + print("\nPreparing features...") + X, y, numeric_features, le_target = prepare_features(df, target_col="Class") + print(f"Prepared features: {numeric_features}") + + print("\nTraining and evaluating models...") + results, X_train, X_test, y_train, y_test = train_and_evaluate(X, y) + print("Training complete.\n") + + save_data(df) + + best_name = max(results, key=lambda k: results[k]["test_acc"]) + best_pipeline = results[best_name]["pipeline"] + print(f"Best model: {best_name} (test_acc={results[best_name]['test_acc']:.3f})") + save_model_artifacts(best_pipeline, le_target, model_name=f"{best_name}.joblib") + + if input("\nDo you want to test a custom input interactively? (y/n): ").lower().startswith("y"): + + pipeline, le = load_model_artifacts(f"{best_name}.joblib") + predict_from_input(pipeline, numeric_features, le) + + print("\nAll done.") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index db6a13a..0ec0cdf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,6 @@ seaborn matplotlib scikit-learn numpy +joblib #to run: pip install -r requirements.txt \ No newline at end of file diff --git a/src/inference.py b/src/inference.py new file mode 100644 index 0000000..cacf2d2 --- /dev/null +++ b/src/inference.py @@ -0,0 +1,47 @@ +import os +from joblib import dump, load + +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MODELS_DIR = os.path.join(BASE_DIR, "models") +os.makedirs(MODELS_DIR, exist_ok=True) + + +def save_model_artifacts(pipeline, label_encoder, model_name="best_model.joblib"): + save_path = os.path.join(MODELS_DIR, model_name) + try: + dump({"pipeline": pipeline, "label_encoder": label_encoder}, save_path) + print(f"Saved model artifacts to: {save_path}") + except Exception as e: + print(f"Error saving model artifacts: {e}") + + +def load_model_artifacts(model_name="best_model.joblib"): + + load_path = os.path.join(MODELS_DIR, model_name) + if not os.path.exists(load_path): + raise FileNotFoundError(f"Model artifact not found: {load_path}") + obj = load(load_path) + return obj["pipeline"], obj["label_encoder"] + + +def predict_from_input(pipeline, numeric_features, le_target): + + import numpy as np + + print("\nEnter values for features (press Enter to use 0):") + vals = [] + for feat in numeric_features: + raw = input(f"{feat}: ").strip() + if raw == "": + raw = "0" + try: + v = float(raw) + except Exception: + print("Invalid input; using 0.") + v = 0.0 + vals.append(v) + + arr = np.array(vals).reshape(1, -1) + pred = pipeline.predict(arr) + label = le_target.inverse_transform(pred) + print(f"\nPredicted Class: {label[0]}") diff --git a/src/load_data.py b/src/load_data.py new file mode 100644 index 0000000..4b0b675 --- /dev/null +++ b/src/load_data.py @@ -0,0 +1,32 @@ +import pandas as pd +import os + +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed") + + +def load_data(file_path: str): + try: + df = pd.read_csv(file_path) + print(f"Data loaded from: {file_path} (shape={df.shape})") + return df + except FileNotFoundError: + print(f"Error: file not found -> {file_path}") + return None + except pd.errors.ParserError: + print(f"Error: file is not a valid CSV -> {file_path}") + return None + except Exception as e: + print(f"Unexpected error reading data: {e}") + return None + + +def save_data(df, filename="processed_data.csv"): + + os.makedirs(PROCESSED_DIR, exist_ok=True) + save_path = os.path.join(PROCESSED_DIR, filename) + try: + df.to_csv(save_path, index=False) + print(f"Processed data saved to: {save_path}") + except Exception as e: + print(f"Error saving processed data: {e}") diff --git a/src/preprocessing.py b/src/preprocessing.py new file mode 100644 index 0000000..2690fd2 --- /dev/null +++ b/src/preprocessing.py @@ -0,0 +1,45 @@ +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from sklearn.impute import SimpleImputer +import numpy as np + + +def prepare_features(df: pd.DataFrame, target_col: str = "Class"): + data = df.copy() + + grade_map = {f"G-{i:02d}": i for i in range(1, 13)} + if "GradeID" in data.columns: + data["GradeID"] = ( + data["GradeID"] + .astype(str) + .map(grade_map) + .fillna(-1) + ) + + if "StudentAbsenceDays" in data.columns: + data["StudentAbsenceDays"] = data["StudentAbsenceDays"].map({ + "Under-7": 1, + "Above-7": 0 + }).fillna(0) + + preferred = ["raisedhands", "VisITedResources", "Discussion", "StudentAbsenceDays"] + numeric_features = [c for c in preferred if c in data.columns] + + if len(numeric_features) < 4: + numeric_candidates = data.select_dtypes(include=[np.number]).columns.tolist() + numeric_candidates = [c for c in numeric_candidates if c != target_col] + if len(numeric_candidates) < 4: + raise ValueError(f"Not enough numeric features found. Candidates: {numeric_candidates}") + numeric_features = numeric_candidates[:4] + + imputer = SimpleImputer(strategy="median") + data[numeric_features] = imputer.fit_transform(data[numeric_features]) + + if target_col not in data.columns: + raise ValueError(f"Target column '{target_col}' not found in dataframe.") + le_target = LabelEncoder() + y = le_target.fit_transform(data[target_col]) + + X = data[numeric_features].copy() + + return X, y, numeric_features, le_target diff --git a/src/train_and_evaluate.py b/src/train_and_evaluate.py new file mode 100644 index 0000000..7d5b2fc --- /dev/null +++ b/src/train_and_evaluate.py @@ -0,0 +1,78 @@ +import numpy as np +from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import Perceptron, LogisticRegression +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score, classification_report, confusion_matrix + +RANDOM_STATE = 42 + + +def train_and_evaluate(X, y): + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y + ) + + models = { + "DecisionTree": Pipeline([ + ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE)) + ]), + "RandomForest": Pipeline([ + ("clf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)) + ]), + "Perceptron": Pipeline([ + ("scaler", StandardScaler()), + ("clf", Perceptron(max_iter=1000, random_state=RANDOM_STATE)) + ]), + "LogisticRegression": Pipeline([ + ("scaler", StandardScaler()), + ("clf", LogisticRegression( + max_iter=2000, + solver="lbfgs", + random_state=RANDOM_STATE + )) + ]), + "MLP": Pipeline([ + ("scaler", StandardScaler()), + ("clf", MLPClassifier( + hidden_layer_sizes=(64, 32), + activation="relu", + learning_rate_init=0.001, + solver="adam", + max_iter=3000, + random_state=RANDOM_STATE + )) + ]), + } + + results = {} + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) + + for name, pipe in models.items(): + print(f"\nTraining {name} ...") + cv_scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring="accuracy", n_jobs=-1) + pipe.fit(X_train, y_train) + y_pred = pipe.predict(X_test) + acc_test = accuracy_score(y_test, y_pred) + clf_report = classification_report(y_test, y_pred) + conf_mat = confusion_matrix(y_test, y_pred, labels=np.unique(y_train)) + + results[name] = { + "pipeline": pipe, + "cv_mean": float(np.mean(cv_scores)), + "cv_std": float(np.std(cv_scores)), + "test_acc": float(acc_test), + "report": clf_report, + "confusion_matrix": conf_mat + } + + print(f"=== {name} ===") + print(f"CV accuracy: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}") + print(f"Test accuracy: {acc_test:.3f}") + print("Classification Report:\n", clf_report) + + return results, X_train, X_test, y_train, y_test diff --git a/src/visualization.py b/src/visualization.py new file mode 100644 index 0000000..b4f94b7 --- /dev/null +++ b/src/visualization.py @@ -0,0 +1,57 @@ +import seaborn as sb +import matplotlib.pyplot as plt +import time +import numpy as np + + +def plot_correlation(df): + plt.figure(figsize=(10, 8)) + sb.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f") + plt.title("Correlation Heatmap") + plt.tight_layout() + plt.show() + + +def menu_plots(df): + + choices = { + 1: ("Marks Class Count Graph", lambda: sb.countplot(x='Class', data=df, order=['L', 'M', 'H'])), + 2: ("Semester-wise", lambda: sb.countplot(x='Semester', hue='Class', data=df)), + 3: ("Gender-wise", lambda: sb.countplot(x='gender', hue='Class', data=df)), + 4: ("Nationality-wise", lambda: sb.countplot(x='NationalITy', hue='Class', data=df)), + 5: ("Grade-wise", lambda: sb.countplot(x='GradeID', hue='Class', data=df)), + 6: ("Section-wise", lambda: sb.countplot(x='SectionID', hue='Class', data=df)), + 7: ("Topic-wise", lambda: sb.countplot(x='Topic', hue='Class', data=df)), + 8: ("Stage-wise", lambda: sb.countplot(x='StageID', hue='Class', data=df)), + 9: ("Absent Days-wise", lambda: sb.countplot(x='StudentAbsenceDays', hue='Class', data=df)) + } + + while True: + print("\nPlot Menu:") + for k in range(1, 10): + print(f"{k}. {choices[k][0]}") + print("10. Exit plotting") + + try: + ch = int(input("Enter Choice: ")) + except Exception: + print("Invalid input. Enter a number.") + continue + + if ch == 10: + print("Exiting plots menu.") + break + + if ch in choices: + print(f"Loading Graph: {choices[ch][0]}") + time.sleep(0.5) + plt.figure(figsize=(10, 6)) + try: + choices[ch][1]() + plt.title(choices[ch][0]) + plt.tight_layout() + plt.show() + except Exception as e: + print("Plotting failed:", e) + else: + print("Invalid choice.")