-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_dataset.py
More file actions
78 lines (60 loc) · 2.77 KB
/
Copy pathprepare_dataset.py
File metadata and controls
78 lines (60 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import datasets
from datasets import Dataset, load_dataset, concatenate_datasets
import utils
from excel_opearations import ExcelOperations
from models import ZeroShotModels
from pcapoperations import PcapOperations
class PrepareData:
    """Accumulate labelled packet prompts for the attack/normal dataset.

    Every sample is stored in ``self.processed`` as a dict of the form
    ``{"text": <prompt>, "label": "attack" | "normal"}`` and can be written
    out with :meth:`writeCsv`.
    """

    # Payload values that carry no usable content; packets with one of these
    # payloads are skipped when collecting non-malicious samples.
    _PLACEHOLDER_PAYLOADS = ("unknown", "", "unknown\n", "\n")

    def __init__(self):
        """Load the ground truth via ``ExcelOperations.read_xlsx`` and start empty."""
        self.excel_opearations = ExcelOperations()
        # Looked up by file name in prepare_mix_data; each entry is then
        # indexed by packet position and tested for truthiness.
        self.base_truth = self.excel_opearations.read_xlsx()
        self.processed = []

    def prepare_mix_data(self, data) -> None:
        """Append one labelled sample per packet of the mixed captures.

        A packet is labelled ``"attack"`` when its ground-truth entry is
        truthy and ``"normal"`` otherwise.

        Args:
            data: Iterable of dicts, each with a ``"file_name"`` key and a
                ``"result"`` sequence whose items provide ``"protocol"`` and
                ``"payload"``.

        No defaults are applied: a file name absent from ``self.base_truth``,
        or a packet position beyond its ground-truth entry, propagates the
        underlying lookup error.
        """
        for capture in data:
            truth = self.base_truth[capture["file_name"]]
            for index, packet in enumerate(capture["result"]):
                prompt = utils.generate_prompt(packet["protocol"], packet["payload"])
                label = "attack" if truth[index] else "normal"
                self.processed.append({"text": prompt, "label": label})

    def prepare_not_malicious_data(self, data, limit: bool = True) -> None:
        """Append ``"normal"`` samples built from non-malicious captures.

        Packets whose payload is empty or ``unknown`` (optionally followed by
        a single newline) are skipped.

        Args:
            data: Iterable of dicts, each with a ``"result"`` sequence whose
                items provide ``"protocol"`` and ``"payload"``.
            limit: When true, stop once the number of samples added reaches a
                third of the samples that were already collected when this
                method was called. When false, every usable packet is added.

        The original intent was to keep non-malicious data at roughly 30%
        against 70% malicious, bearing in mind that the mixed data already
        contains some non-malicious packets. NOTE(review): the cap used here
        is one added sample per three existing ones - confirm that this is the
        ratio actually wanted.
        """
        cap = 0
        if limit:
            cap = len(self.processed) / 3
            if cap <= 0:
                # Nothing has been collected yet, so the cap is zero. Checking
                # only after an append would still let one sample through.
                return

        added = 0
        for capture in data:
            for packet in capture["result"]:
                if packet["payload"] in self._PLACEHOLDER_PAYLOADS:
                    continue
                self.processed.append(
                    {
                        "text": utils.generate_prompt(packet["protocol"], packet["payload"]),
                        "label": "normal",
                    }
                )
                if limit:
                    # Only samples that were actually added count toward the cap.
                    added += 1
                    if added >= cap:
                        return

    def writeCsv(self, filename: str) -> None:
        """Write all collected samples to *filename* via ``ExcelOperations.write_csv``."""
        self.excel_opearations.write_csv(self.processed, filename)
# Helpers used by the disabled generation steps further down; they are still
# constructed on every run.
data = PrepareData()
zero_shot = ZeroShotModels()
pcap = PcapOperations()
model = zero_shot.get_models_by_suffix("llama-2-7b")[0]

# Disabled steps, kept for reference, that write ./data/normal.csv:
# pcap.process_files(model, './inputs/Non-Malicious/', False, False)
# directly use the non-malicious samples that were already read
# data.prepare_not_malicious_data(model["train"], False)
# data.writeCsv("./data/normal.csv")

# Load the local CSV as a "test" split with an explicit text/label schema.
candidate_labels = ["attack", "normal"]
normal_features = datasets.Features(
    {
        'text': datasets.Value('string'),
        'label': datasets.ClassLabel(num_classes=2, names=candidate_labels),
    }
)
dataset = load_dataset(
    'csv',
    data_files={'test': './data/normal.csv'},
    features=normal_features,
)

# Append the CSV rows to the hub dataset's test split and push the result
# back to the same repository.
full_dataset = load_dataset("niting3c/Malicious_packets")
test_data = full_dataset["test"]
new_dataset = concatenate_datasets([test_data, dataset["test"]])
full_dataset["test"] = new_dataset
full_dataset.push_to_hub("niting3c/Malicious_packets")