Skip to content

Commit 54894eb

Browse files
Merge pull request #2 from DefinetlyNotAI/v4
2 parents 4a8cb42 + d0f84fb commit 54894eb

23 files changed

+1463
-1435
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,7 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
/models/
209+
/cache/
210+
/data/
211+
/*Data_Visualization/

.idea/VulnScan.iml

Lines changed: 6 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/dictionaries/project.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Generator.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from vulnscan import DataGen, TrainingConfig, log

# ---------------- CONFIG ----------------
# noinspection DuplicatedCode
cfg = TrainingConfig()
cfg.update({
    "MODEL_NAME": "Model_Sense.4n1",
    "BATCH_SIZE": 32,
    "MAX_EPOCHS": 35,
    "TRAIN_LOOPS": 3,
    "EARLY_STOPPING_PATIENCE": 5,
    "LR": 1e-3,
    "LR_JUMP": {"MAX": 5, "MIN": 0.1},
    "COUNTER": {"PATIENCE": 0, "JUMP": 0},
    "JUMP_PATIENCE": 3,
    "LR_DECAY": 0.9,
    "AUTO_CONTINUE": False,
    "DATASET_SIZE": 25000,
    "TEXT_MAX_LEN": 128,
    "TEXT_MAX_LEN_JUMP_RANGE": 10,
    "VAL_SPLIT": 0.85,
    "TRAIN_VAL_SPLIT": 0.8,
    "SENSITIVE_PROB": 0.5,
    "TOP_K": 30,
    "TOP_P": 0.9,
    "TEMPERATURE": 0.9,
    "REP_PENALTY": 1.2,
    "RETRY_LIMIT": 3,
    "RAM_THRESHOLD": 0.85
})

# ---------------- MODEL ----------------
log(message="Loading GPT-Neo model for text generation...", cfg=cfg)
gpt_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
gpt_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(cfg.DEVICE)
if gpt_tokenizer.pad_token is None:
    # GPT-Neo ships without a pad token; reuse EOS so padded batch generation works.
    gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# ---------------- DATASET GENERATION ----------------
# Target dataset sizes. Larger datasets are built incrementally on top of the
# largest already-cached smaller dataset, so only the delta is generated.
dataset_ranges = [10, 100, 1000, 5000, 10000, 17500, 25000]

for dr in dataset_ranges:
    dataset_path = f"{cfg.DATASET_CACHE_DIR}/dataset_{dr}.pt"

    # Skip if already exists
    if os.path.exists(dataset_path):
        data = torch.load(dataset_path, map_location="cpu")
        log(f"Found existing dataset {dataset_path} with {len(data['texts'])} samples. Skipping generation.", cfg=cfg)
        continue

    # --- Find largest smaller dataset to reuse as a base ---
    base_texts, base_labels = [], []
    smaller_existing = [r for r in dataset_ranges if r < dr]
    smaller_existing.sort(reverse=True)

    for sr in smaller_existing:
        candidate_path = f"{cfg.DATASET_CACHE_DIR}/dataset_{sr}.pt"
        if os.path.exists(candidate_path):
            data = torch.load(candidate_path, map_location="cpu")
            base_texts, base_labels = data["texts"], data["labels"]
            log(f"Using {candidate_path} as base with {len(base_texts)} samples.", cfg=cfg)
            break

    # How many new samples we still need on top of the base
    remaining = dr - len(base_texts)
    if remaining <= 0:
        log(f"Already have enough samples for {dr}, just saving subset.", cfg=cfg)
        torch.save({"texts": base_texts[:dr], "labels": base_labels[:dr]}, dataset_path)
        continue

    cfg.update({"DATASET_SIZE": remaining})
    log(f"Generating {remaining} new samples for dataset {dr}...", cfg=cfg)
    generate = DataGen(cfg=cfg)

    # BUGFIX: initialise before the try block. Previously, a KeyboardInterrupt
    # raised *inside* generate.dataset() hit the handler with new_texts and
    # new_labels unbound, raising NameError instead of saving the base samples.
    new_texts, new_labels = [], []
    try:
        new_texts, new_labels = generate.dataset(gpt_tokenizer=gpt_tokenizer, gpt_model=gpt_model)
    except KeyboardInterrupt:
        # Best-effort checkpoint of whatever we have, then re-raise to exit.
        base_texts.extend(new_texts)
        base_labels.extend(new_labels)
        torch.save({"texts": base_texts, "labels": base_labels}, dataset_path)
        log(f"Interrupted. Saved {len(base_texts)} samples so far to {dataset_path}", cfg=cfg)
        raise

    # Save full dataset (base + newly generated)
    base_texts.extend(new_texts)
    base_labels.extend(new_labels)
    torch.save({"texts": base_texts, "labels": base_labels}, dataset_path)
    log(f"Saved dataset with {len(base_texts)} samples to {dataset_path}", cfg=cfg)

README.md

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ This document outlines the system's naming conventions, lifecycle, and model con
2121
## Naming Conventions
2222

2323
### Model Naming Format
24-
`Model {Type of model} .{Version}`
24+
`Model_{Type of model}.{Version}`
2525

2626
- **Type of Model**: Describes the training data configuration.
27-
- `Sense`: Sensitive data set with 50k files, each 50KB in size.
28-
- `SenseNano`: Test set with 5-10 files, each 5KB, used for error-checking.
29-
- `SenseMacro`: Large dataset with 1M files, each 10KB. This is computationally intensive, so some corners were cut in training.
30-
- `SenseMini`: Dataset with 10K files, each between 10-200KB. Balanced size for effective training and resource efficiency.
27+
- `SenseNano`: Test set with <10k files or <1k vals (PT), used for error-checking.
28+
- `SenseMini`: Dataset with 10k to 50k files or 1k-5k vals (PT). `Balanced size for effective training and resource efficiency`.
29+
- `Sense`: Sensitive data set with 50k to 100k files or 5k-10k (PT).
30+
- `SenseMacro`: Large dataset with >100k files or >10k (PT).
3131

3232
- **Version Format**: `{Version#}{c}{Repeat#}`
3333
- **Version#**: Increment for major code updates.
@@ -128,20 +128,16 @@ as well as previously trained models for you to test out
128128
The repository is located [here](https://github.com/DefinetlyNotAI/VulnScan_Data).
129129

130130
The repository contains the following directories:
131-
- `Archived Models`: Contains the previously trained models. Is organized by the model type then version.
131+
- `cache`: Contains all training data generated by [`Generator.py`](Generator.py).
132132
- `NN features`: Contains information about the model `.3n3` and the vectorizer used. Information include:
133133
- `Documentation_Study_Network.md`: A markdown file that contains more info.
134-
- `Neural Network Nodes Graph.gexf`: A Gephi file that contains the model nodes and edges.
135-
- `Nodes and edges (GEPHI).csv`: A CSV file that contains the model nodes and edges.
136-
- `Statistics`: Directories made by Gephi, containing the statistics of the model nodes and edges.
134+
- `Neural_Network_Nodes_Graph.gexf`: A Gephi file that contains the model nodes and edges.
137135
- `Feature_Importance.svg`: An SVG file that contains the feature importance of the model.
138136
- `Loss_Landscape_3D.html`: An HTML file that contains the 3D loss landscape of the model.
139-
- `Model Accuracy Over Epochs.png` and `Model Loss Over Epochs.png`: PNG files that contain the model accuracy and loss over epochs.
140-
- `Model state dictionary.txt`: A text file that contains the model state dictionary.
141-
- `Model Summary.txt`: A text file that contains the model summary.
142-
- `Model Visualization.png`: A PNG file that contains the model visualization.
143-
- `Top_90_Features.svg`: A SVG file that contains the top 90 features of the model.
144-
- `Vectorizer features.txt`: A text file that contains the vectorizer features.
145-
- `Visualize Activation.png`: A PNG file that contains the visualization of the model activation.
146-
- `Visualize t-SNE.png`: A PNG file that contains the visualization of the model t-SNE.
147-
- `Weight Distribution.png`: A PNG file that contains the weight distribution of the model.
137+
- `Model_State_Dict.txt`: A text file that contains the model state dictionary.
138+
- `Model_Summary.txt`: A text file that contains the model summary.
139+
- `Model_Visualization.png`: A PNG file that contains the model visualization.
140+
- `Visualize_Activation.png`: A PNG file that contains the visualization of the model activation.
141+
- `Visualize_tSNE.png`: A PNG file that contains the visualization of the model t-SNE with the default training test embeds.
142+
- `Visualize_tSNE_custom.png`: A PNG file that contains the visualization of the model t-SNE with real world training examples (only 100).
143+
- `Weight_Distribution.png`: A PNG file that contains the weight distribution of the model.

Trainer.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
import json
2+
import os
3+
import sys
4+
5+
import torch
6+
from sentence_transformers import SentenceTransformer
7+
from torch.utils.data import DataLoader
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
9+
10+
from vulnscan import log, Train, plot_training, SimpleNN, EmbeddingDataset, TrainingConfig, DataGen
11+
12+
13+
# ---------------- INIT ----------------
def init(config: TrainingConfig) -> dict:
    """Initialize static, config-free resources (only once)."""
    try:
        log("Loading GPT-Neo tokenizer/model (static init)...", cfg=config, only_console=True)
        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
        language_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
        # GPT-Neo has no pad token by default; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        log("Loading MiniLM for embeddings (static init)...", cfg=config, only_console=True)
        sentence_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    except KeyboardInterrupt:
        sys.exit("Interrupted by user in initialization.")
    except Exception as err:
        sys.exit(f"Error during initialization: {err}")

    # Returned handles are consumed by train(); keys are part of the contract.
    return {
        "gpt_tokenizer": tokenizer,
        "gpt_model": language_model,
        "embed_model": sentence_encoder,
    }
35+
36+
37+
# ---------------- TRAIN ----------------
def train(config: TrainingConfig, resources: dict):
    """Run one full training round for the current config.

    Loads (or generates) the dataset, splits it into train/val/test,
    produces embeddings, trains the model, then plots and saves history.

    Args:
        config: Active TrainingConfig; DATASET_SIZE, MODEL_NAME and
            MODEL_ROUND are expected to be set by the caller.
        resources: Dict from init() with keys "gpt_tokenizer", "gpt_model"
            and "embed_model".

    Exits the process on KeyboardInterrupt or on any error, reporting the
    stage (`part`) that failed.
    """
    # `part` labels the current stage so the generic handler can report
    # exactly where a failure happened.
    part = "???"
    try:
        # Load resources from init
        part = "init resources loading"
        gpt_tokenizer = resources["gpt_tokenizer"]
        gpt_model = resources["gpt_model"].to(config.DEVICE)  # attach to device here
        embed_model = resources["embed_model"]

        # Initialise DataGen
        part = "initialising DataGen"
        log("Initialising DataGen with config...", cfg=config, silent=True)
        generate = DataGen(cfg=config)

        # Generate or load the cached dataset for this DATASET_SIZE
        part = "generating/loading the dataset"
        dataset_path = f"{config.DATASET_CACHE_DIR}/dataset_{config.DATASET_SIZE}.pt"
        if os.path.exists(dataset_path):
            log("Loading existing dataset...", cfg=config)
            data = torch.load(dataset_path)
            texts, labels = data["texts"], data["labels"]
        else:
            log("Dataset not found, generating", cfg=config)
            texts, labels = generate.dataset(gpt_tokenizer=gpt_tokenizer, gpt_model=gpt_model)
            torch.save({"texts": texts, "labels": labels}, dataset_path)

        # Split dataset: [0, train_split) train, [train_split, val_split) val,
        # [val_split, end) test.
        part = "splitting the dataset"
        train_split = int(len(texts) * config.TRAIN_VAL_SPLIT)
        val_split = int(len(texts) * config.VAL_SPLIT)

        train_texts, train_labels = texts[:train_split], labels[:train_split]
        val_texts, val_labels = texts[train_split:val_split], labels[train_split:val_split]
        test_texts, test_labels = texts[val_split:], labels[val_split:]

        # Generate embeddings for all splits
        part = "generating the embeddings"
        log("Generating test embeddings...", cfg=config)
        generate.embeddings(embed_model=embed_model, texts=test_texts, labels=test_labels, split="test")
        log("Generating train embeddings...", cfg=config)
        generate.embeddings(embed_model=embed_model, texts=train_texts, labels=train_labels, split="train")
        log("Generating validation embeddings...", cfg=config)
        generate.embeddings(embed_model=embed_model, texts=val_texts, labels=val_labels, split="validation")

        # Prepare datasets and dataloaders
        part = "preparing datasets and dataloaders"
        # NOTE(review): both datasets are built from the same EMBED_CACHE_DIR;
        # presumably EmbeddingDataset distinguishes splits internally — confirm,
        # otherwise train and validation would read identical data.
        train_dataset = EmbeddingDataset(config.EMBED_CACHE_DIR)
        val_dataset = EmbeddingDataset(config.EMBED_CACHE_DIR)
        val_loader = DataLoader(dataset=val_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

        train_ = Train(cfg=config)
        model = SimpleNN(input_dim=384).to(config.DEVICE)  # 384 = MiniLM-L6-v2 embedding dim

        # Run training (handles TRAIN_LOOPS internally)
        part = "training the model"
        history_loops = train_.model(model=model, train_dataset=train_dataset, val_loader=val_loader)

        # Plot + save history
        part = "plotting and saving training history"
        # BUGFIX: plot_training always receives the full history_loops, so it was
        # redundantly re-plotting the same figure once per loop — call it once.
        plot_training(cfg=config, history_loops=history_loops)
        for i, history in enumerate(history_loops):
            with open(
                    f"{config.CACHE_DIR}/{config.MODEL_NAME}/round_{config.MODEL_ROUND}/training_history_loop{i + 1}.json",
                    "w") as f:
                json.dump(history, f)

        log("Training complete. All data, plots, and model saved.", cfg=config)
    except KeyboardInterrupt:
        sys.exit("Interrupted by user during training.")
    except Exception as err:
        sys.exit(f"Error during '{part}': {err}")
109+
110+
111+
if __name__ == "__main__":
    # noinspection DuplicatedCode
    # ---------------- CONFIG ----------------
    cfg = TrainingConfig()
    cfg.update({
        # Core training loop parameters
        "BATCH_SIZE": 32,  # Samples per training batch
        "MAX_EPOCHS": 35,  # Hard upper bound on epochs per loop
        "TRAIN_LOOPS": 3,  # Full-dataset training passes (with improvement measures)
        "EARLY_STOPPING_PATIENCE": 5,  # Epochs without improvement before stopping early
        "LR": 1e-3,  # Initial learning rate
        "LR_JUMP": {"MAX": 5, "MIN": 0.1},  # Bounds for learning-rate jumps
        "COUNTER": {"PATIENCE": 0, "JUMP": 0},  # Running counters for patience / LR jumps
        "JUMP_PATIENCE": 3,  # Epochs to wait before a learning-rate jump
        "LR_DECAY": 0.9,  # Multiplier applied to LR on decay
        "AUTO_CONTINUE": False,  # If True, ignore EARLY_STOPPING_PATIENCE and keep going

        # Text sample generation
        "TEXT_MAX_LEN": 128,  # Max length of a generated text sample
        "TEXT_MAX_LEN_JUMP_RANGE": 10,  # Random variation range for text length
        "VAL_SPLIT": 0.85,  # Fraction used for train + validation (rest is test)
        "TRAIN_VAL_SPLIT": 0.8,  # Fraction used for training (rest is validation)
        "SENSITIVE_PROB": 0.5,  # Probability a sample contains sensitive data

        # Sampling / decoding settings
        "TOP_K": 30,  # Top-K sampling cutoff
        "TOP_P": 0.9,  # Nucleus (top-p) sampling probability
        "TEMPERATURE": 0.9,  # Sampling temperature
        "REP_PENALTY": 1.2,  # Repetition penalty
        "RETRY_LIMIT": 3,  # Retries when generation fails

        # System limits
        "RAM_THRESHOLD": 0.85  # Max RAM fraction before halting generation and offloading
    })
    train_init = init(cfg)

    # ----------------- RUN ------------------
    try:
        available_dataset = [10, 100, 1000, 5000, 10000, 17500, 25000]
        # Model-type tiers keyed by the largest dataset size each one covers;
        # anything above the last cap falls through to SenseMacro.
        size_tiers = ((1000, "SenseNano"), (5000, "SenseMini"), (10000, "Sense"))
        for loop_idx, dataset in enumerate(available_dataset, start=1):
            name = next((tier for cap, tier in size_tiers if dataset <= cap), "SenseMacro")
            model_round = loop_idx
            cfg.update({
                # Model / caching / logging
                "MODEL_NAME": f"Model_{name}.4n1",  # Identifies the model for caching/logging
                "DATASET_SIZE": dataset,  # Samples to generate/load for this round
                "MODEL_ROUND": model_round  # Current training round (auto-incremented)
            })
            log(message=f"Training 'Model_{name}.4n1/round_{model_round}/' with {dataset} dataset...", cfg=cfg)
            train(config=cfg, resources=train_init)
    except KeyboardInterrupt:
        sys.exit("Interrupted by user in main.")
    except Exception as e:
        sys.exit(f"Error during training: {e}")

requirements.txt

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1+
scikit-learn~=1.7.1
2+
tabulate~=0.9.0
3+
tqdm~=4.67.1
4+
transformers~=4.55.4
15
joblib~=1.3.2
2-
matplotlib~=3.10.1
3-
torch~=2.8.0
4-
xgboost~=2.1.4
56
configparser~=7.1.0
6-
scikit-learn~=1.6.1
7-
Faker~=36.1.1
7+
matplotlib~=3.10.1
88
networkx~=3.2.1
9-
numpy~=2.2.3
10-
plotly~=6.0.0
9+
numpy~=2.3.2
10+
plotly~=6.3.0
1111
seaborn~=0.13.2
12-
torchviz~=0.0.3
13-
14-
tqdm~=4.66.6
12+
Faker~=36.1.1
13+
torchviz~=0.0.3

0 commit comments

Comments
 (0)