# File: process_and_prepare_dataset.py (78 lines, 61 loc, 2.65 KB)
# ------------------------------------------------------------
# 🧪 process_and_prepare_dataset.py
# Dataset Cleaner + Feature Engineer for IDS Model Training
# Intel Unnati IDS | Authors : Arjun, Nimish and Shaurya | Version: 1.0
# ------------------------------------------------------------
import pandas as pd
import numpy as np
# ------------------------------------------------------------
# 📂 Load Raw CSVs
# ------------------------------------------------------------
# Read the raw attack and normal captures from the csv/ directory.
attack_df = pd.read_csv("csv/attack_raw.csv")
normal_df = pd.read_csv("csv/normal_raw.csv")

# 🧹 Fill missing values with 0
attack_df.fillna(0, inplace=True)
normal_df.fillna(0, inplace=True)

# 🏷️ Add binary labels (attack rows get 1, normal rows get 0)
attack_df["label"] = 1
normal_df["label"] = 0

# ------------------------------------------------------------
# 🔻 Downsample (to balance dataset)
# ------------------------------------------------------------
# Cap normal traffic to 100K rows. The sample size is clamped to the frame
# length because DataFrame.sample (without replacement) raises ValueError
# when asked for more rows than exist; a smaller capture is kept in full.
normal_df = normal_df.sample(n=min(len(normal_df), 100000), random_state=42)

# Cap attacks to 100K rows max
if len(attack_df) > 100000:
    attack_df = attack_df.sample(n=100000, random_state=42)
# ------------------------------------------------------------
# 🧠 Smart Feature Engineering
# ------------------------------------------------------------
# Helper: Convert IP string to integer
def ip_to_int(ip):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value.

    Args:
        ip: Value to convert. It is passed through ``str()`` first, so
            non-string cells (for example the ``0`` used to fill missing
            values) are accepted.

    Returns:
        The address as an unsigned big-endian integer, or ``0`` when the
        value is not exactly four dot-separated octets in ``0..255``.
    """
    octets = str(ip).split(".")
    # Fewer or more than four parts ("0", "1.2", "1.2.3.4.5") is not an IPv4
    # address; map it to 0 instead of packing a partial byte string.
    if len(octets) != 4:
        return 0
    try:
        return int.from_bytes(bytes(map(int, octets)), "big")
    except ValueError:
        # Raised for a non-numeric part or an octet outside 0..255. Only this
        # is caught so KeyboardInterrupt / SystemExit are not swallowed.
        return 0
# Parse one tcp.flags cell: text starting with "0x" is read as hexadecimal
# (e.g., "0x12" → 18); any other value goes through float() and is then
# truncated to int.
def _tcp_flags_to_int(raw):
    text = str(raw)
    if text.startswith("0x"):
        return int(text, 16)
    return int(float(raw))


# Apply the same column conversions and derived features to both frames.
for frame in (attack_df, normal_df):
    # Convert IP columns to integers
    for ip_col in ("ip.src", "ip.dst"):
        frame[ip_col] = frame[ip_col].apply(ip_to_int)

    # Convert TCP flags to integers
    frame["tcp.flags"] = frame["tcp.flags"].apply(_tcp_flags_to_int)

    # Add engineered features
    src_port = frame["tcp.srcport"]
    dst_port = frame["tcp.dstport"]
    frame_len = frame["frame.len"]

    # 1 when the destination port is below 1024, else 0
    frame["is_well_known_port"] = (dst_port < 1024).astype(int)
    # Absolute distance between source and destination port numbers
    frame["port_diff"] = (src_port - dst_port).abs()
    # Flags value scaled by frame length; the +1 presumably guards against
    # division by zero when frame.len is 0 — TODO confirm intent
    frame["tcp_flag_score"] = frame["tcp.flags"] / (frame_len + 1)
    # 1 when the IP protocol number is above 100, else 0
    frame["proto_complexity"] = (frame["ip.proto"] > 100).astype(int)
    # Sum of the UDP length field and the frame length
    frame["payload_size_est"] = frame["udp.length"] + frame_len
# ------------------------------------------------------------
# 🧪 Final Combined Dataset
# ------------------------------------------------------------
# Stack the attack rows above the normal rows, then zero-fill any cell that is
# still missing in the combined frame (e.g. a column present in only one input).
final_df = pd.concat([attack_df, normal_df]).fillna(0)

# ------------------------------------------------------------
# 💾 Save Output
# ------------------------------------------------------------
# index=False keeps the DataFrame index out of the written CSV.
final_df.to_csv("csv/final_processed.csv", index=False)
print("✅ Dataset ready: csv/final_processed.csv")