Skip to content

Commit 207c8cb

Browse files
Tool bug fixes and QoL changes
Fixed tensor device type mismatch and impossible accuracy values. Updated model loading to unpack the state dictionary in Test_Model. Added sensitive and nonsensitive text datasets for model testing and visualization.
1 parent 6a86e14 commit 207c8cb

File tree

4 files changed

+130
-128
lines changed

4 files changed

+130
-128
lines changed

todo

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
For future version 4n2
2+
3+
Private key → Pred = 0.000
4+
System IP → Pred = 0.000
5+
Private SSH key → Pred = 0.000
6+
7+
Do include these in future version 4n2

tools/Study_Models.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from torch.utils.data import DataLoader, Dataset
1313
from torchviz import make_dot
1414

15+
from data import test_texts, test_labels
16+
1517
# ----------------- Setup -----------------
1618
NAME = "Model_Sense.4n1"
1719
ROUND = 5
@@ -291,24 +293,13 @@ def save_model_summary(model_, filename="Model_Summary.txt"):
291293
print("Running visualize_activations...")
292294
visualize_activations(model, sample_input)
293295
print("Preparing texts and labels for t-SNE custom visualization...")
294-
texts = [
295-
# Non-sensitive (0)
296-
"I need to buy milk and bread from the grocery store.",
297-
"The weather is nice today, let's go for a walk.",
298-
"Python is my favorite programming language.",
299-
300-
# Sensitive (1)
301-
"My credit card number is 4929 1234 5678 9012.",
302-
"The patient's medical history includes diabetes and hypertension.",
303-
"My social security number is 123-45-6789."
304-
]
305-
labels = [0, 0, 0, 1, 1, 1]
296+
306297
print("Loading SentenceTransformer embedder...")
307298
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
308299
print("Running visualize_tsne...")
309300
visualize_tsne(model, dataloader)
310301
print("Running visualize_tsne_custom...")
311-
visualize_tsne_custom(model, embedder, texts, labels)
302+
visualize_tsne_custom(model, embedder, test_texts, test_labels)
312303
print("Running visualize_feature_importance...")
313304
visualize_feature_importance(input_dim)
314305
print("Running plot_loss_landscape_3d...")

tools/Test_Model.py

Lines changed: 14 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
import glob
2+
import os
13
import sys
24

35
import torch
46
from sentence_transformers import SentenceTransformer
7+
8+
from data import test_texts, test_labels
59
from vulnscan import SimpleNN
6-
import glob
7-
import os
10+
811
# ---------------- INIT ----------------
912
NAME = "Model_Sense.4n1"
1013
ROUND = 5
@@ -36,16 +39,17 @@ def load_embeddings(folder_path, pattern):
3639

3740
# Load all train/test/val embeddings
3841
train_embeddings, train_labels = load_embeddings(cache_dir, "train_*.pt")
39-
test_embeddings, test_labels = load_embeddings(cache_dir, "test_*.pt")
42+
test_embeddings, embed_test_labels = load_embeddings(cache_dir, "test_*.pt")
4043
val_embeddings, val_labels = load_embeddings(cache_dir, "validation_*.pt")
4144

4245
# Initialize model
4346
input_dim = train_embeddings.shape[1]
4447
model = SimpleNN(input_dim=input_dim).to(device)
45-
model.load_state_dict(torch.load(
48+
state = torch.load(
4649
f"../cache/{NAME}/round_{ROUND}/{NAME}_round{ROUND}.pth",
4750
map_location="cpu"
48-
))
51+
)
52+
model.load_state_dict(state["model_state_dict"])
4953
model.eval()
5054

5155
# Load SentenceTransformer
@@ -60,119 +64,14 @@ def load_embeddings(folder_path, pattern):
6064
print("Sample predictions vs true labels (first 50%):")
6165
sample_size = len(preds) // 2
6266
for i in range(sample_size):
63-
print(f" Pred={preds[i].item():.3f} | Label={test_labels[i].item()}")
67+
print(f" Pred={preds[i].item():.3f} | Label={embed_test_labels[i].item()}")
6468
# Calculate accuracy
65-
pred_labels = (preds >= 0.5).long().cpu()
66-
accuracy = (pred_labels == test_labels).sum().item() / len(test_labels)
67-
print(f"\nAccuracy on stored embeddings: {accuracy * 100:.2f}%")
69+
pred_labels = (preds >= 0.5).long().view(-1).cpu()
70+
true_labels = embed_test_labels.view(-1).cpu()
71+
accuracy = (pred_labels == true_labels).float().mean().item()
72+
print(f"Accuracy on stored embeddings: {accuracy*100:.2f}%")
6873

6974
# ---------------- 2. TEST ON NATURAL EXAMPLES ----------------
70-
sensitive_texts = [
71-
"My SSN is 123-45-6789",
72-
"Here is my credit card number: 4111 1111 1111 1111",
73-
"Password: hunter2",
74-
"My email is johndoe@gmail.com",
75-
"Bank account number: 987654321",
76-
"Private key: -----BEGIN RSA PRIVATE KEY-----",
77-
"The system IP is 192.168.1.1",
78-
"Contact me at +1-202-555-0147",
79-
"Visa card exp 12/25 CVV 123",
80-
"Secret API key: sk_test_4eC39HqLyjWDarjtT1zdp7dc",
81-
"My passport number is X1234567",
82-
"Driver's license ID: D12345678",
83-
"Social security info: 987-65-4321",
84-
"Encrypted key: 6f1e9a2c3d4b5e6f7a8b",
85-
"Database password: P@ssw0rd123",
86-
"Credit card CVV: 321",
87-
"Bank routing number: 021000021",
88-
"Private SSH key: -----BEGIN OPENSSH PRIVATE KEY-----",
89-
"My personal address: 123 Main St, Springfield",
90-
"Medical record ID: MR123456",
91-
"Tax ID: 123-45-6789",
92-
"Company login password: Admin@2025",
93-
"Phone PIN: 4321",
94-
"Wi-Fi password: mysecretwifi",
95-
"API token: abc123def456",
96-
"OAuth secret: 987zyx654",
97-
"PIN code: 2468",
98-
"Student ID: S123456789",
99-
"Mother's maiden name: Smith",
100-
"Credit card expiration: 10/27",
101-
"Bank account PIN: 5678",
102-
"Encrypted password hash: $2b$12$abcd1234...",
103-
"My vehicle VIN: 1HGCM82633A004352",
104-
"Passport expiration: 09/2030",
105-
"Private email password: qwerty123",
106-
"Debit card number: 5500 0000 0000 0004",
107-
"Bank security code: 789",
108-
"My date of birth: 1990-01-01",
109-
"Secret question answer: Blue",
110-
"Corporate VPN password: VPN@1234",
111-
"My home phone: +1-555-123-4567",
112-
"Bank card CVV2: 456",
113-
"Employee SSN: 234-56-7890",
114-
"Private notes: Login credentials",
115-
"Access key: AKIAIOSFODNN7EXAMPLE",
116-
"Credit card PIN: 1357",
117-
"Encrypted token: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
118-
"My secondary email password: pass1234",
119-
"Personal tax record: TR987654321",
120-
]
121-
nonsensitive_texts = [
122-
"The sky is blue today",
123-
"I had pasta for lunch",
124-
"Cats are very playful animals",
125-
"The capital of France is Paris",
126-
"I enjoy playing video games",
127-
"Tomorrow will be sunny",
128-
"Mount Everest is the tallest mountain",
129-
"Water boils at 100 degrees Celsius",
130-
"I love listening to music",
131-
"Python is a programming language",
132-
"I walked in the park yesterday",
133-
"Coffee tastes great in the morning",
134-
"The train arrives at 9 AM",
135-
"My favorite color is green",
136-
"Birds are singing outside",
137-
"The museum opens at 10 AM",
138-
"I like reading science fiction novels",
139-
"The ocean waves are calming",
140-
"I bought a new notebook",
141-
"Chocolate ice cream is delicious",
142-
"The book is on the table",
143-
"I enjoy jogging in the evening",
144-
"The concert was amazing",
145-
"I prefer tea over coffee",
146-
"Clouds are moving quickly",
147-
"The cat is sleeping on the sofa",
148-
"I painted a landscape yesterday",
149-
"The sun sets in the west",
150-
"I visited the mountains last summer",
151-
"Reading improves vocabulary",
152-
"The flowers bloom in spring",
153-
"I wrote a poem today",
154-
"Dogs are loyal pets",
155-
"The bakery smells wonderful",
156-
"I attended a workshop on AI",
157-
"The painting has vibrant colors",
158-
"I love hiking in the forest",
159-
"The classroom is very bright",
160-
"I learned a new recipe today",
161-
"The movie was entertaining",
162-
"I played chess with a friend",
163-
"The library has many books",
164-
"I enjoy listening to jazz music",
165-
"The playground is full of kids",
166-
"I watched a documentary yesterday",
167-
"The stars are shining tonight",
168-
"I planted a tree in the backyard",
169-
"The festival was fun and lively",
170-
"I took a photography course",
171-
"The coffee shop is near my office",
172-
]
173-
test_texts = sensitive_texts + nonsensitive_texts
174-
test_labels = [1] * len(sensitive_texts) + [0] * len(nonsensitive_texts)
175-
17675
# Encode with SentenceTransformer
17776
with torch.no_grad():
17877
test_embs = embed_model.encode(test_texts, convert_to_tensor=True, device=device)

tools/data.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
sensitive_texts = [
2+
"My SSN is 123-45-6789",
3+
"Here is my credit card number: 4111 1111 1111 1111",
4+
"Password: hunter2",
5+
"My email is johndoe@gmail.com",
6+
"Bank account number: 987654321",
7+
"Private key: -----BEGIN RSA PRIVATE KEY-----",
8+
"The system IP is 192.168.1.1",
9+
"Contact me at +1-202-555-0147",
10+
"Visa card exp 12/25 CVV 123",
11+
"Secret API key: sk_test_4eC39HqLyjWDarjtT1zdp7dc",
12+
"My passport number is X1234567",
13+
"Driver's license ID: D12345678",
14+
"Social security info: 987-65-4321",
15+
"Encrypted key: 6f1e9a2c3d4b5e6f7a8b",
16+
"Database password: P@ssw0rd123",
17+
"Credit card CVV: 321",
18+
"Bank routing number: 021000021",
19+
"Private SSH key: -----BEGIN OPENSSH PRIVATE KEY-----",
20+
"My personal address: 123 Main St, Springfield",
21+
"Medical record ID: MR123456",
22+
"Tax ID: 123-45-6789",
23+
"Company login password: Admin@2025",
24+
"Phone PIN: 4321",
25+
"Wi-Fi password: mysecretwifi",
26+
"API token: abc123def456",
27+
"OAuth secret: 987zyx654",
28+
"PIN code: 2468",
29+
"Student ID: S123456789",
30+
"Mother's maiden name: Smith",
31+
"Credit card expiration: 10/27",
32+
"Bank account PIN: 5678",
33+
"Encrypted password hash: $2b$12$abcd1234...",
34+
"My vehicle VIN: 1HGCM82633A004352",
35+
"Passport expiration: 09/2030",
36+
"Private email password: qwerty123",
37+
"Debit card number: 5500 0000 0000 0004",
38+
"Bank security code: 789",
39+
"My date of birth: 1990-01-01",
40+
"Secret question answer: Blue",
41+
"Corporate VPN password: VPN@1234",
42+
"My home phone: +1-555-123-4567",
43+
"Bank card CVV2: 456",
44+
"Employee SSN: 234-56-7890",
45+
"Private notes: Login credentials",
46+
"Access key: AKIAIOSFODNN7EXAMPLE",
47+
"Credit card PIN: 1357",
48+
"Encrypted token: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
49+
"My secondary email password: pass1234",
50+
"Personal tax record: TR987654321",
51+
]
52+
nonsensitive_texts = [
53+
"The sky is blue today",
54+
"I had pasta for lunch",
55+
"Cats are very playful animals",
56+
"The capital of France is Paris",
57+
"I enjoy playing video games",
58+
"Tomorrow will be sunny",
59+
"Mount Everest is the tallest mountain",
60+
"Water boils at 100 degrees Celsius",
61+
"I love listening to music",
62+
"Python is a programming language",
63+
"I walked in the park yesterday",
64+
"Coffee tastes great in the morning",
65+
"The train arrives at 9 AM",
66+
"My favorite color is green",
67+
"Birds are singing outside",
68+
"The museum opens at 10 AM",
69+
"I like reading science fiction novels",
70+
"The ocean waves are calming",
71+
"I bought a new notebook",
72+
"Chocolate ice cream is delicious",
73+
"The book is on the table",
74+
"I enjoy jogging in the evening",
75+
"The concert was amazing",
76+
"I prefer tea over coffee",
77+
"Clouds are moving quickly",
78+
"The cat is sleeping on the sofa",
79+
"I painted a landscape yesterday",
80+
"The sun sets in the west",
81+
"I visited the mountains last summer",
82+
"Reading improves vocabulary",
83+
"The flowers bloom in spring",
84+
"I wrote a poem today",
85+
"Dogs are loyal pets",
86+
"The bakery smells wonderful",
87+
"I attended a workshop on AI",
88+
"The painting has vibrant colors",
89+
"I love hiking in the forest",
90+
"The classroom is very bright",
91+
"I learned a new recipe today",
92+
"The movie was entertaining",
93+
"I played chess with a friend",
94+
"The library has many books",
95+
"I enjoy listening to jazz music",
96+
"The playground is full of kids",
97+
"I watched a documentary yesterday",
98+
"The stars are shining tonight",
99+
"I planted a tree in the backyard",
100+
"The festival was fun and lively",
101+
"I took a photography course",
102+
"The coffee shop is near my office",
103+
]
104+
test_texts = sensitive_texts + nonsensitive_texts
105+
test_labels = [1] * len(sensitive_texts) + [0] * len(nonsensitive_texts)

0 commit comments

Comments
 (0)