|
| 1 | +""" |
| 2 | +SpamShield v3.1 - SMS Spam Classifier |
| 3 | +AI-Powered SMS Spam Detection |
| 4 | +Auto-downloads the SMSSpamCollection dataset if missing |
| 5 | +Efficient batch classification from CSV/TXT |
| 6 | +""" |
| 7 | + |
| 8 | +import os, sys, threading, csv |
| 9 | +import tkinter as tk |
| 10 | +from tkinter import filedialog, messagebox, ttk |
| 11 | + |
| 12 | +import ttkbootstrap as tb |
| 13 | +from ttkbootstrap.constants import * |
| 14 | + |
| 15 | +try: |
| 16 | + from tkinterdnd2 import TkinterDnD, DND_FILES |
| 17 | + DND_ENABLED = True |
| 18 | +except ImportError: |
| 19 | + DND_ENABLED = False |
| 20 | + print("Drag & Drop requires tkinterdnd2: pip install tkinterdnd2") |
| 21 | + |
| 22 | +# ML libs |
| 23 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 24 | +from sklearn.naive_bayes import MultinomialNB |
| 25 | +from sklearn.pipeline import make_pipeline |
| 26 | +from sklearn.model_selection import train_test_split |
| 27 | +from sklearn.metrics import accuracy_score |
| 28 | +import pandas as pd |
| 29 | +import joblib |
| 30 | + |
| 31 | +import urllib.request |
| 32 | +import zipfile |
| 33 | + |
| 34 | +# ---------------------- UTIL ---------------------- |
| 35 | + |
def resource_path(file_name):
    """Return the path of ``file_name`` inside the application's base directory.

    The base directory is ``sys._MEIPASS`` when that attribute exists (it is
    set by frozen/bundled builds); otherwise it is the directory containing
    this source file.
    """
    try:
        base_dir = sys._MEIPASS
    except AttributeError:
        # Not running from a bundle: resolve relative to this module.
        base_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base_dir, file_name)
| 39 | + |
def download_dataset():
    """Download and unpack the SMS Spam Collection archive next to the app.

    The archive is saved as ``smsspamcollection.zip`` in the resource
    directory, extracted into that same directory, and then deleted.

    If the download or the extraction fails, an error dialog is shown and the
    process exits with status 1. The (possibly partial) archive is removed in
    every case so a broken file is never left behind.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    zip_path = resource_path("smsspamcollection.zip")
    try:
        urllib.request.urlretrieve(url, zip_path)
        # Extraction is inside the same guard: a truncated or non-zip payload
        # raises here and must be reported the same way as a network error.
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extractall(resource_path(""))
    except Exception as e:
        messagebox.showerror("Download Failed", f"Failed to download dataset:\n{e}")
        sys.exit(1)
    finally:
        # Runs on success, on failure, and while SystemExit propagates.
        try:
            if os.path.exists(zip_path):
                os.remove(zip_path)
        except OSError as e:
            print(f"[WARN] Could not remove {zip_path}: {e}")
| 52 | + |
| 53 | +# ---------------------- ML MODEL ---------------------- |
| 54 | + |
def train_sms_model():
    """Train the TF-IDF + Multinomial Naive Bayes spam model and cache it.

    The ``SMSSpamCollection`` file is read from the resource directory and
    downloaded first when it is missing. The file is parsed as tab-separated
    ``label<TAB>text`` lines, ``ham``/``spam`` are mapped to 0/1, and the data
    is split 80/20 (``random_state=42``) to report a test accuracy on stdout.
    The fitted pipeline is written to ``sms_spam_model.pkl`` in the resource
    directory.

    Returns:
        The fitted scikit-learn pipeline.
    """
    ds_path = resource_path("SMSSpamCollection")
    if not os.path.exists(ds_path):
        download_dataset()

    # QUOTE_NONE: messages contain stray double quotes; with default quoting
    # the parser treats them as field delimiters and can merge several lines
    # into a single record.
    df = pd.read_csv(ds_path, sep="\t", header=None, names=["label", "text"],
                     quoting=csv.QUOTE_NONE)
    df["label_num"] = df["label"].map({"ham": 0, "spam": 1})

    # Rows with an unknown label (mapped to NaN) or missing text cannot be
    # used for fitting; drop them instead of failing inside scikit-learn.
    df = df.dropna(subset=["text", "label_num"])
    df["label_num"] = df["label_num"].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label_num"],
                                                        test_size=0.2, random_state=42)

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(f"[INFO] Model trained — Test Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

    model_path = resource_path("sms_spam_model.pkl")
    try:
        joblib.dump(model, model_path)
    except OSError as e:
        # The cache is an optimisation only; a read-only install directory
        # must not prevent the freshly trained model from being used.
        print(f"[WARN] Could not cache model at {model_path}: {e}")

    return model
| 76 | + |
def load_model():
    """Return the cached spam model, training a new one when necessary.

    ``sms_spam_model.pkl`` in the resource directory is loaded when it exists.
    If the file is missing, or cannot be loaded (corrupt file, incompatible
    library versions), the model is retrained via ``train_sms_model``.
    """
    model_path = resource_path("sms_spam_model.pkl")
    if os.path.exists(model_path):
        try:
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only a model file written by this application
            # should ever be placed at this path.
            return joblib.load(model_path)
        except Exception as e:
            # Any load failure is recoverable by retraining from the dataset.
            print(f"[WARN] Could not load cached model ({e}); retraining.")
    return train_sms_model()
| 82 | + |
| 83 | +# ---------------------- SPAM WORKER ---------------------- |
| 84 | + |
class SpamWorker:
    """Classify the first column of each input file and report via callbacks.

    Args:
        files: Sequence of file paths to process, in order.
        model: Object with a ``predict(list_of_texts)`` method whose results
            compare equal to ``1`` for spam.
        callbacks: Dict of optional callables, looked up by key:
            ``"found"(path, text, label)`` for every classified message,
            ``"progress"(percent)`` and ``"stats"(dict)`` after every file,
            ``"finished"()`` once when ``run`` ends.
        max_results: Upper bound on the total number of classified messages
            across all files.
    """

    def __init__(self, files, model, callbacks, max_results=200_000):
        self.files = files
        self.model = model
        self.callbacks = callbacks
        self.max_results = max_results
        self._running = True

    def stop(self):
        """Ask ``run`` to stop; it is honoured before the next message."""
        self._running = False

    def _emit(self, name, *args):
        """Invoke the callback registered under ``name``, if there is one."""
        if name in self.callbacks:
            self.callbacks[name](*args)

    @staticmethod
    def _read_texts(path):
        """Return the stripped first CSV field of every non-empty row."""
        with open(path, newline="", encoding="utf-8", errors="ignore") as f:
            return [row[0].strip() for row in csv.reader(f) if row]

    def run(self):
        """Process the files until done, stopped, or ``max_results`` is hit.

        A file that cannot be read or classified is reported on stdout and
        skipped. ``"finished"`` is emitted in every case.
        """
        total_files = len(self.files)
        stats = {"TOTAL": 0, "SPAM": 0, "HAM": 0}

        for i, path in enumerate(self.files):
            # The cap is global: once it is reached no further file is opened.
            if not self._running or stats["TOTAL"] >= self.max_results:
                break

            try:
                remaining = self.max_results - stats["TOTAL"]
                # Trim before predicting so no work is spent on rows that
                # would be discarded by the cap anyway.
                texts = self._read_texts(path)[:remaining]

                if texts:
                    labels_num = self.model.predict(texts)
                    for text, num in zip(texts, labels_num):
                        if not self._running:
                            break
                        label = "SPAM" if num == 1 else "HAM"
                        stats[label] += 1
                        stats["TOTAL"] += 1
                        self._emit("found", path, text, label)

            except Exception as e:
                # Deliberately broad: one bad file must not abort the batch.
                print(f"[WARN] Could not read {path}: {e}")

            pct = int((i + 1) / total_files * 100)
            self._emit("progress", pct)
            # A copy is handed out so receivers never see later mutations.
            self._emit("stats", dict(stats))

        self._emit("finished")
| 137 | + |
| 138 | +# ---------------------- MAIN APP ---------------------- |
| 139 | + |
class SpamShieldApp:
    """Desktop UI for batch SMS spam classification.

    Files are queued through the Browse dialog or drag & drop, classified by
    a ``SpamWorker`` on a background thread, shown in a table, and can be
    exported to a text file.

    Threading: the worker never touches widgets. Its callbacks only enqueue
    events on ``self._events``; ``_pump_events`` drains that queue on the Tk
    main loop and applies the updates there.
    """

    APP_NAME = "SpamShield"
    APP_VERSION = "3.1"
    SUPPORTED_EXT = (".csv", ".txt")

    # Tree tag for rows that represent queued input files (not results).
    _QUEUED_TAG = "queued"
    # How often the main loop drains worker events, and how many per tick.
    _PUMP_INTERVAL_MS = 30
    _PUMP_BATCH = 500

    def __init__(self):
        """Create the window, load (or train) the model, and build the UI."""
        import queue  # local import: only this class needs it

        if DND_ENABLED:
            self.root = TkinterDnD.Tk()
        else:
            self.root = tb.Window(themename="darkly")

        self.root.title(f"{self.APP_NAME} v{self.APP_VERSION}")
        self.root.minsize(1200, 650)

        self.model = load_model()

        self.worker = None
        self.smooth = 0          # value currently shown by the progress bar
        self.target = 0          # value the progress bar animates towards
        self.file_set = set()    # queued paths, for duplicate detection
        self._queued_files = []  # queued paths in insertion order
        self._results = {}       # tree item id -> (text, label)
        self._events = queue.Queue()  # worker thread -> main loop
        self._run_id = 0         # events of other runs are discarded

        self._build_ui()
        self._apply_styles()

    def _build_ui(self):
        """Create all widgets and start the periodic main-loop tasks."""
        main = tb.Frame(self.root, padding=10)
        main.pack(fill=tk.BOTH, expand=True)

        tb.Label(main, text=f"📩 {self.APP_NAME} - AI SMS Spam Detector",
                 font=("Segoe UI", 22, "bold")).pack(pady=(0, 4))

        tb.Label(main, text="Batch classification — handles large SMS datasets",
                 font=("Segoe UI", 10, "italic"), foreground="#9ca3af").pack(pady=(0, 12))

        row1 = tb.Frame(main)
        row1.pack(fill=tk.X)

        self.path_input = tb.Entry(row1, width=90)
        self.path_input.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 6))
        self.path_input.insert(0, "Drag & drop CSV/TXT files with SMS data…")

        btn_browse = tb.Button(row1, text="📂 Browse", bootstyle=INFO, command=self.browse)
        btn_browse.pack(side=tk.LEFT, padx=3)

        self.btn_start = tb.Button(row1, text="🚀 Start", bootstyle=SUCCESS, command=self.start)
        self.btn_start.pack(side=tk.LEFT, padx=3)

        self.btn_cancel = tb.Button(row1, text="⏹ Cancel", bootstyle=DANGER, command=self.cancel)
        self.btn_cancel.pack(side=tk.LEFT, padx=3)
        self.btn_cancel.config(state=tk.DISABLED)

        btn_export = tb.Button(row1, text="💾 Export", bootstyle=PRIMARY, command=self.export_results)
        btn_export.pack(side=tk.LEFT, padx=3)

        btn_about = tb.Button(row1, text="ℹ️ About", bootstyle=INFO, command=self.show_about)
        btn_about.pack(side=tk.LEFT, padx=3)

        self.progress = tb.Progressbar(main, bootstyle="success-striped", maximum=100)
        self.progress.pack(fill=tk.X, pady=(6, 6))

        columns = ("selected", "text", "label")
        self.tree = ttk.Treeview(main, columns=columns, show="headings", height=20)
        self.tree.heading("selected", text="☑️")
        self.tree.heading("text", text="SMS Text")
        self.tree.heading("label", text="Prediction")

        self.tree.column("selected", width=50, anchor=tk.CENTER)
        self.tree.column("text", width=800)
        self.tree.column("label", width=120)

        # Row colours are configured once here instead of on every insert.
        self.tree.tag_configure("SPAM", foreground="#dc2626")
        self.tree.tag_configure("HAM", foreground="#4ade80")

        self.tree.pack(fill=tk.BOTH, expand=True)

        self.stats_lbl = tb.Label(main, text="TOTAL: 0 | SPAM: 0 | HAM: 0")
        self.stats_lbl.pack(anchor=tk.E)

        self.root.after(15, self._anim_progress)
        self.root.after(self._PUMP_INTERVAL_MS, self._pump_events)

        if DND_ENABLED:
            self.tree.drop_target_register(DND_FILES)
            self.tree.dnd_bind("<<Drop>>", self.on_drop)

    # ---- File Queue ----

    def browse(self):
        """Let the user pick CSV/TXT files and queue them."""
        files = filedialog.askopenfilenames(title="Select SMS Data Files",
                                            filetypes=[("CSV Files", "*.csv"), ("Text Files", "*.txt")])
        if files:
            self._queue_files(files)

    def on_drop(self, event):
        """Queue the files carried by a drag & drop event."""
        paths = self.root.tk.splitlist(event.data)
        self._queue_files(paths)

    def _queue_files(self, paths):
        """Queue every supported, not yet queued path and show it in the table."""
        for p in paths:
            ext = os.path.splitext(p)[1].lower()
            if ext in self.SUPPORTED_EXT and p not in self.file_set:
                self.file_set.add(p)
                self._queued_files.append(p)
                self.tree.insert("", tk.END, values=("☑️", p, "Queued"),
                                 tags=(self._QUEUED_TAG,))
        self.path_input.delete(0, tk.END)
        self.path_input.insert(0, f"{len(self.file_set)} files queued")

    # ---- Actions ----

    def start(self):
        """Classify all queued files on a background thread."""
        # Taken from the queue list, not from the table: the table also holds
        # result rows, whose text must never be interpreted as a file path.
        selected = list(self._queued_files)
        if not selected:
            messagebox.showwarning("No Data", "Select at least one file to classify.")
            return

        # Results of a previous run are replaced so table and stats agree.
        if self._results:
            self.tree.delete(*self._results)
            self._results.clear()
        self._update_stats({"TOTAL": 0, "SPAM": 0, "HAM": 0})

        self.btn_start.config(state=tk.DISABLED)
        self.btn_cancel.config(state=tk.NORMAL)
        self.progress["value"] = 0
        self.smooth = 0
        self.target = 0

        self._run_id += 1
        # The worker is created here, on the main thread, so that cancel()
        # always sees the worker belonging to the run that was just started.
        self.worker = self._make_worker(selected)
        threading.Thread(target=self.worker.run, daemon=True).start()

    def _make_worker(self, files):
        """Return a SpamWorker whose callbacks only enqueue events."""
        run_id = self._run_id

        def forward(name):
            return lambda *args: self._post(run_id, name, *args)

        return SpamWorker(files, self.model,
                          callbacks={name: forward(name)
                                     for name in ("found", "progress", "stats", "finished")})

    def _run_worker(self, files):
        """Create a worker for ``files`` and run it in the calling thread."""
        self.worker = self._make_worker(files)
        self.worker.run()

    def _post(self, run_id, name, *args):
        """Enqueue a worker event; safe to call from any thread."""
        self._events.put((run_id, name, args))

    def _drain_events(self, limit=None):
        """Apply up to ``limit`` queued events (all when ``None``).

        Must run on the Tk main thread. Events that belong to a run other
        than the current one (cancelled or superseded) are dropped.

        Returns:
            The number of events taken from the queue.
        """
        handlers = {"found": self._add_row,
                    "progress": self._set_target,
                    "stats": self._update_stats,
                    "finished": self._finish}
        handled = 0
        # empty()/get_nowait() is safe here: this method is the only consumer.
        while not self._events.empty() and (limit is None or handled < limit):
            run_id, name, args = self._events.get_nowait()
            handled += 1
            if run_id == self._run_id:
                handlers[name](*args)
        return handled

    def _pump_events(self):
        """Periodic main-loop task that applies pending worker events."""
        try:
            # Bounded per tick so a burst of results cannot freeze the UI.
            self._drain_events(self._PUMP_BATCH)
        finally:
            self.root.after(self._PUMP_INTERVAL_MS, self._pump_events)

    def _add_row(self, file, text, label):
        """Append one classified message to the table."""
        iid = self.tree.insert("", tk.END, values=("☑️", text, label), tags=(label,))
        # Kept outside the widget so export gets the exact original text.
        self._results[iid] = (text, label)

    def _update_stats(self, stats):
        """Show the TOTAL/SPAM/HAM counters from ``stats``."""
        self.stats_lbl.config(text=f"TOTAL: {stats['TOTAL']} | SPAM: {stats['SPAM']} | HAM: {stats['HAM']}")

    def _set_target(self, v):
        """Set the percentage the progress bar should animate towards."""
        self.target = v

    def _anim_progress(self):
        """Advance the progress bar one step towards ``target`` every 15 ms."""
        if self.smooth < self.target:
            self.smooth += 1
            self.progress["value"] = self.smooth
        self.root.after(15, self._anim_progress)

    def cancel(self):
        """Stop the running worker and discard its pending events."""
        if self.worker:
            self.worker.stop()
        # Bumping the run id makes _drain_events drop everything still queued
        # by the cancelled worker, so the table stops growing immediately.
        self._run_id += 1
        labels = [lbl for _, lbl in self._results.values()]
        self._update_stats({"TOTAL": len(labels),
                            "SPAM": labels.count("SPAM"),
                            "HAM": labels.count("HAM")})
        self._finish()

    def _finish(self):
        """Restore the idle button state and show a full progress bar."""
        self.btn_start.config(state=tk.NORMAL)
        self.btn_cancel.config(state=tk.DISABLED)
        # Pin the animation state too, otherwise the next animation tick
        # would overwrite the bar with a lower intermediate value.
        self.smooth = 100
        self.target = 100
        self.progress["value"] = 100

    # ---- Export ----

    def export_results(self):
        """Write all classified messages to a text file chosen by the user."""
        # Only real results are exported; queued-file placeholder rows are not.
        rows = list(self._results.values())
        if not rows:
            messagebox.showwarning("Export", "No classified messages to export.")
            return

        path = filedialog.asksaveasfilename(defaultextension=".txt",
                                            filetypes=[("Text Files", "*.txt")])
        if path:
            with open(path, "w", encoding="utf-8") as f:
                f.writelines(f"{text} | {lbl}\n" for text, lbl in rows)
            messagebox.showinfo("Export", "Results exported successfully!")

    # ---- About ----

    def show_about(self):
        """Show the About dialog."""
        messagebox.showinfo(
            f"About {self.APP_NAME}",
            f"{self.APP_NAME} v{self.APP_VERSION}\n\n"
            "• Drag & drop SMS dataset files\n"
            "• Auto-downloads needed dataset\n"
            "• Batch ML classification\n"
            "• SPAM/HAM highlighting\n"
            "• Export results\n\n"
            "🏢 Built with ❤️"
        )

    def _apply_styles(self):
        """Apply the dark theme and the custom progress bar colours."""
        self.root.style = tb.Style(theme="darkly")
        self.root.style.configure("TProgressbar", troughcolor="#1b1f3a",
                                  background="#7c3aed", thickness=14)

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.root.mainloop()
| 333 | + |
| 334 | +# ---- Run App ---- |
| 335 | + |
if __name__ == "__main__":
    # Script entry point: build the application window and block in its
    # event loop until the user closes it.
    SpamShieldApp().run()
0 commit comments