-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathml_train.py
More file actions
117 lines (105 loc) · 4.78 KB
/
ml_train.py
File metadata and controls
117 lines (105 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, f1_score
# Seed NumPy's global random number generator so repeated runs are
# reproducible; the splits and tree-based models below additionally pass
# random_state=42 explicitly.
np.random.seed(42)
def load_code_dataset(dataset_roots, file_exts=(".py", ".java", ".js")):
    """Load labelled source files from one or more dataset root directories.

    Every immediate sub-directory of a root is treated as one class: its
    lower-cased name becomes the label of each matching file found directly
    inside it (nested directories are not searched).

    Args:
        dataset_roots: Iterable of directory paths. Entries that are not
            existing directories are skipped with a warning.
        file_exts: File suffixes to load (for example ``".py"``); each one is
            matched with ``glob("*<ext>")`` inside every label folder.

    Returns:
        A ``(df, labels)`` tuple. ``df`` is a ``pandas.DataFrame`` with one
        row per non-empty file and the columns ``"code"`` (file contents with
        surrounding whitespace stripped) and ``"label"``. ``labels`` is the
        sorted list of every label folder seen, including folders that
        contributed no files.

    Raises:
        ValueError: If no non-empty matching file was found in any root.
    """
    data = []
    labels_set = set()

    for root in dataset_roots:
        # isdir (rather than exists) also rejects a root that is a regular
        # file, which would otherwise make os.listdir() raise.
        if not os.path.isdir(root):
            print(f"[WARNING] Dataset root not found: {root}")
            continue

        for folder in sorted(os.listdir(root)):
            folder_path = os.path.join(root, folder)
            if not os.path.isdir(folder_path):
                continue

            # Normalize labels to lowercase for consistency
            label = folder.lower()
            labels_set.add(label)
            file_count = 0

            for ext in file_exts:
                # glob() yields entries in filesystem order; sorting keeps the
                # row order (and therefore any seeded split of the result)
                # identical from one machine to the next.
                for file in sorted(Path(folder_path).glob(f"*{ext}")):
                    try:
                        with open(file, "r", encoding="utf-8", errors="ignore") as f:
                            code = f.read().strip()
                    except (OSError, UnicodeError) as e:
                        # Best effort: one unreadable file must not abort the load.
                        print(f"[WARNING] Could not read {file}: {e}")
                        continue

                    # Files that are empty or whitespace-only carry no signal.
                    if code:
                        data.append({"code": code, "label": label})
                        file_count += 1

            print(f"[INFO] Loaded {file_count} files from {folder}/ -> {label}")

    if not data:
        raise ValueError("No valid files found!")

    df = pd.DataFrame(data)
    print(f"[INFO] Total dataset: {len(df)} samples")
    print(f"[INFO] Label distribution:\n{df['label'].value_counts()}")
    return df, sorted(labels_set)
# ---------------------------------------------------------------------------
# Training script: load the labelled code corpus, build character n-gram
# TF-IDF features, train four classifiers, report validation/test metrics and
# persist every fitted artifact to OUTPUT_DIR.
# ---------------------------------------------------------------------------

# Dataset paths
DATASET_ROOTS = [
    r"D:\Projects\Python\Code_Detector\Dataset\Python",
    r"D:\Projects\Python\Code_Detector\Dataset\Java",
    r"D:\Projects\Python\Code_Detector\Dataset\JS",
]
OUTPUT_DIR = r"D:\Projects\Python\Code_Detector\model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("[INFO] Loading dataset...")
df, labels = load_code_dataset(DATASET_ROOTS)
print(f"[INFO] Dataset size: {len(df)}, Labels: {labels}")

y = df["label"]

# Split dataset. The raw text is split *before* vectorizing so that the
# TF-IDF vocabulary and IDF weights are learned from the training split only;
# fitting the vectorizer on the full corpus would leak validation/test
# documents into the features and inflate the reported metrics.
code_train, code_test, y_train, y_test = train_test_split(
    df["code"], y, test_size=0.2, stratify=y, random_state=42
)
code_train, code_val, y_train, y_val = train_test_split(
    code_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)
print(f"[INFO] Split sizes -> Train: {len(y_train)}, Val: {len(y_val)}, Test: {len(y_test)}")

# Vectorizer (character 3- to 5-grams), fitted on the training split only
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
X_train = vectorizer.fit_transform(code_train)
X_val = vectorizer.transform(code_val)
X_test = vectorizer.transform(code_test)

# Class weights
classes = np.unique(y_train)
weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, weights))
print(f"[INFO] Class weights: {class_weight_dict}")

# GradientBoosting and XGBoost are constructed without a class_weight
# argument, so they receive the same balancing as per-sample weights at fit
# time instead.
train_sample_weight = np.array([class_weight_dict[label] for label in y_train])
SAMPLE_WEIGHTED_MODELS = {"GradientBoosting", "XGBoost"}

# Models
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight=class_weight_dict),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, class_weight=class_weight_dict, random_state=42, n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.1, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss",
    ),
}

# Encode labels for XGBoost (it is trained on integer class ids)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)


def _print_metrics(split_name, y_true, y_pred):
    """Print accuracy and macro-averaged F1 for one evaluation split."""
    print(f"[INFO] {split_name} set metrics:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Score (Macro): {f1_score(y_true, y_pred, average='macro'):.4f}")


# Training and evaluation
for name, model in models.items():
    print(f"\n[INFO] Training {name}...")

    # XGBoost is fitted and scored on the encoded targets; the scikit-learn
    # models work on the string labels directly.
    if name == "XGBoost":
        fit_y, val_y, test_y = y_train_enc, y_val_enc, y_test_enc
    else:
        fit_y, val_y, test_y = y_train, y_val, y_test

    if name in SAMPLE_WEIGHTED_MODELS:
        model.fit(X_train, fit_y, sample_weight=train_sample_weight)
    else:
        model.fit(X_train, fit_y)

    # The validation split was previously created but never evaluated.
    _print_metrics("Validation", val_y, model.predict(X_val))
    _print_metrics("Test", test_y, model.predict(X_test))

    save_path = os.path.join(OUTPUT_DIR, f"{name.lower()}.pkl")
    joblib.dump(model, save_path)
    print(f"[INFO] Saved {name} to {save_path}")

# Save vectorizer and label encoder (both are needed to reproduce the feature
# space and to map XGBoost's integer predictions back to label names)
joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, "vectorizer.pkl"))
joblib.dump(le, os.path.join(OUTPUT_DIR, "labelencoder.pkl"))
print("[INFO] Saved vectorizer and label encoder.")