Skip to content

Commit 8ee2da0

Browse files
committed
Data Validation part completed
1 parent 2e2268a commit 8ee2da0

10 files changed

Lines changed: 361 additions & 0 deletions

File tree

data_schema/schema.yaml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
# Schema for the phishing-website dataset, consumed by the data-validation step.
#
# `columns` maps every expected column name to its required dtype; the dataframe
# read from ingestion must contain exactly this many columns.
# `numerical_columns` lists columns that must exist AND be of a numeric dtype.
#
# NOTE(review): several names carry spelling quirks from the upstream dataset
# (e.g. Shortining_Service, Domain_registeration_length, popUpWidnow). They are
# data keys and must match the CSV headers exactly — do not "fix" them here.
columns:
  - having_IP_Address: int64
  - URL_Length: int64
  - Shortining_Service: int64
  - having_At_Symbol: int64
  - double_slash_redirecting: int64
  - Prefix_Suffix: int64
  - having_Sub_Domain: int64
  - SSLfinal_State: int64
  - Domain_registeration_length: int64
  - Favicon: int64
  - port: int64
  - HTTPS_token: int64
  - Request_URL: int64
  - URL_of_Anchor: int64
  - Links_in_tags: int64
  - SFH: int64
  - Submitting_to_email: int64
  - Abnormal_URL: int64
  - Redirect: int64
  - on_mouseover: int64
  - RightClick: int64
  - popUpWidnow: int64
  - Iframe: int64
  - age_of_domain: int64
  - DNSRecord: int64
  - web_traffic: int64
  - Page_Rank: int64
  - Google_Index: int64
  - Links_pointing_to_page: int64
  - Statistical_report: int64
  - Result: int64


# Every feature (and the target, Result) is numeric in this dataset.
numerical_columns:
  - having_IP_Address
  - URL_Length
  - Shortining_Service
  - having_At_Symbol
  - double_slash_redirecting
  - Prefix_Suffix
  - having_Sub_Domain
  - SSLfinal_State
  - Domain_registeration_length
  - Favicon
  - port
  - HTTPS_token
  - Request_URL
  - URL_of_Anchor
  - Links_in_tags
  - SFH
  - Submitting_to_email
  - Abnormal_URL
  - Redirect
  - on_mouseover
  - RightClick
  - popUpWidnow
  - Iframe
  - age_of_domain
  - DNSRecord
  - web_traffic
  - Page_Rank
  - Google_Index
  - Links_pointing_to_page
  - Statistical_report
  - Result
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import sys
2+
from pathlib import Path
3+
4+
import pandas as pd
5+
from scipy.stats import ks_2samp
6+
7+
from network_security.constant.training_pipeline import SCHEMA_FILE_PATH
8+
from network_security.entity.artifact_entity import (
9+
DataIngestionArtifact,
10+
DataValidationArtifact,
11+
)
12+
from network_security.entity.config_entity import DataValidationConfig
13+
from network_security.exception.exception import NetworkSecurityException
14+
from network_security.logging.logger import logging
15+
from network_security.utils.main_utils.utils import read_yaml_file, write_yaml_file
16+
17+
18+
class DataValidation:
19+
def __init__(
20+
self,
21+
data_ingestion_artifact: DataIngestionArtifact,
22+
data_validation_config: DataValidationConfig,
23+
) -> None:
24+
try:
25+
self.data_ingestion_artifact = data_ingestion_artifact
26+
self.data_validation_config = data_validation_config
27+
self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
28+
self._numerical_columns = self._schema_config.get("numerical_columns", [])
29+
except Exception as e:
30+
raise NetworkSecurityException(e, sys)
31+
32+
@staticmethod
33+
def read_data(file_path: str) -> pd.DataFrame:
34+
try:
35+
return pd.read_csv(file_path)
36+
except Exception as e:
37+
raise NetworkSecurityException(e, sys)
38+
39+
def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
40+
try:
41+
number_of_columns = len(self._schema_config["columns"])
42+
logging.info(f"Required number of columns:{number_of_columns}")
43+
logging.info(f"Data frame has columns:{len(dataframe.columns)}")
44+
return len(dataframe.columns) == number_of_columns
45+
except Exception as e:
46+
raise NetworkSecurityException(e, sys)
47+
48+
def validate_numerical_columns_exist(self, dataframe: pd.DataFrame) -> bool:
49+
"""
50+
Validates whether all required numerical columns exist in the given DataFrame.
51+
52+
Returns:
53+
bool: True if all required numerical columns exist and are numeric, False otherwise.
54+
55+
"""
56+
try:
57+
required_numerical_columns = self._numerical_columns
58+
missing_columns = []
59+
non_numeric_columns = []
60+
61+
for column in required_numerical_columns:
62+
if column not in dataframe.columns:
63+
missing_columns.append(column)
64+
elif not pd.api.types.is_numeric_dtype(dataframe[column]):
65+
non_numeric_columns.append(column)
66+
67+
if missing_columns:
68+
logging.info(f"Missing numerical columns: {missing_columns}")
69+
if non_numeric_columns:
70+
logging.info(f"Columns not of numeric type: {non_numeric_columns}")
71+
72+
return len(missing_columns) == 0 and len(non_numeric_columns) == 0
73+
74+
except Exception as e:
75+
raise NetworkSecurityException(e, sys)
76+
77+
78+
def detect_dataset_drift(self, base_df: pd.DataFrame, current_df: pd.DataFrame, threshold: float = 0.05) -> bool:
79+
try:
80+
report = {}
81+
for column in base_df.columns:
82+
d1 = base_df[column]
83+
d2 = current_df[column]
84+
is_same_dist = ks_2samp(d1, d2)
85+
is_found = not threshold <= is_same_dist.pvalue
86+
report.update(
87+
{
88+
column: {
89+
"p_value": float(is_same_dist.pvalue),
90+
"drift_status": is_found,
91+
},
92+
},
93+
)
94+
drift_report_file_path = self.data_validation_config.drift_report_file_path
95+
96+
dir_path = Path(drift_report_file_path).parent
97+
dir_path.mkdir(parents=True, exist_ok=True)
98+
write_yaml_file(file_path=drift_report_file_path, content=report)
99+
write_yaml_file(file_path=drift_report_file_path, content=report)
100+
101+
except Exception as e:
102+
raise NetworkSecurityException(e, sys)
103+
104+
def initiate_data_validation(self) -> DataValidationArtifact:
105+
try:
106+
train_file_path = self.data_ingestion_artifact.trained_file_path
107+
test_file_path = self.data_ingestion_artifact.test_file_path
108+
109+
## Read the data from train and test
110+
train_dataframe = DataValidation.read_data(train_file_path)
111+
test_dataframe = DataValidation.read_data(test_file_path)
112+
113+
## Validate number of columns
114+
status = self.validate_number_of_columns(dataframe=train_dataframe)
115+
if not status:
116+
logging.info("Train dataframe does not contain all columns.\n")
117+
118+
status = self.validate_number_of_columns(dataframe=test_dataframe)
119+
if not status:
120+
logging.info("Test dataframe does not contain all columns.\n")
121+
122+
# Validate numerical columns
123+
status = self.validate_numerical_columns_exist(train_dataframe)
124+
if not status:
125+
logging.info("Train dataframe is missing required numerical columns or types.\n")
126+
127+
status = self.validate_numerical_columns_exist(test_dataframe)
128+
if not status:
129+
logging.info("Test dataframe is missing required numerical columns or types.\n")
130+
131+
## Check data drift
132+
status = self.detect_dataset_drift(
133+
base_df=train_dataframe, current_df=test_dataframe)
134+
dir_path = Path(self.data_validation_config.valid_train_file_path).parent
135+
dir_path.mkdir(parents=True, exist_ok=True)
136+
137+
train_dataframe.to_csv(
138+
self.data_validation_config.valid_train_file_path,
139+
index=False,
140+
header=True,
141+
)
142+
143+
test_dataframe.to_csv(
144+
self.data_validation_config.valid_test_file_path,
145+
index=False,
146+
header=True,
147+
)
148+
149+
data_validation_artifact = DataValidationArtifact(
150+
validation_status=status,
151+
valid_train_file_path=self.data_ingestion_artifact.trained_file_path,
152+
valid_test_file_path=self.data_ingestion_artifact.test_file_path,
153+
invalid_train_file_path=None,
154+
invalid_test_file_path=None,
155+
drift_report_file_path=self.data_validation_config.drift_report_file_path,
156+
)
157+
return data_validation_artifact
158+
except Exception as e:
159+
raise NetworkSecurityException(e, sys)

network_security/constant/training_pipeline/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,14 @@
2929
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
3030
DATA_INGESTION_INGESTED_DIR: str = "ingested"
3131
DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2
32+
33+
34+
"""
35+
Data Validation related constant start with DATA_VALIDATION VAR NAME
36+
"""
37+
DATA_VALIDATION_DIR_NAME: str = "data_validation"
38+
DATA_VALIDATION_VALID_DIR: str = "validated"
39+
DATA_VALIDATION_INVALID_DIR: str = "invalid"
40+
DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
41+
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"
42+
PREPROCESSING_OBJECT_FILE_NAME = "preprocessing.pkl"

network_security/entity/artifact_entity.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,13 @@
55
class DataIngestionArtifact:
66
trained_file_path: str
77
test_file_path: str
8+
9+
10+
@dataclass
class DataValidationArtifact:
    """Output of the data-validation stage, consumed by downstream pipeline steps."""

    validation_status: bool  # overall pass/fail of the validation checks
    valid_train_file_path: str  # path to the validated training CSV
    valid_test_file_path: str  # path to the validated test CSV
    invalid_train_file_path: str  # path to rejected training data (caller may pass None)
    invalid_test_file_path: str  # path to rejected test data (caller may pass None)
    drift_report_file_path: str  # path to the YAML KS-test drift report

network_security/entity/config_entity.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,34 @@ def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
4545
)
4646
self.collection_name: str = training_pipeline.DATA_INGESTION_COLLECTION_NAME
4747
self.database_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME
48+
49+
50+
class DataValidationConfig:
    """Filesystem layout for the data-validation stage.

    All paths are derived from the run's artifact directory and the
    DATA_VALIDATION_* constants in the training-pipeline constant module.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        # Root directory for every data-validation artifact of this run.
        self.data_validation_dir: Path = (
            Path(training_pipeline_config.artifact_dir)
            / training_pipeline.DATA_VALIDATION_DIR_NAME
        )
        # Where validated (accepted) data is written.
        self.valid_data_dir: Path = (
            self.data_validation_dir / training_pipeline.DATA_VALIDATION_VALID_DIR
        )
        # Where rejected data would be written.
        self.invalid_data_dir: Path = (
            self.data_validation_dir / training_pipeline.DATA_VALIDATION_INVALID_DIR
        )
        self.valid_train_file_path: Path = (
            self.valid_data_dir / training_pipeline.TRAIN_FILE_NAME
        )
        self.valid_test_file_path: Path = (
            self.valid_data_dir / training_pipeline.TEST_FILE_NAME
        )
        self.invalid_train_file_path: Path = (
            self.invalid_data_dir / training_pipeline.TRAIN_FILE_NAME
        )
        self.invalid_test_file_path: Path = (
            self.invalid_data_dir / training_pipeline.TEST_FILE_NAME
        )
        # YAML report produced by the per-column KS drift check.
        self.drift_report_file_path: Path = (
            self.data_validation_dir
            / training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR
            / training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME
        )

network_security/utils/main_utils/__init__.py

Whitespace-only changes.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import os
2+
3+
# import dill
4+
import sys
5+
from pathlib import Path
6+
7+
import yaml
8+
9+
from network_security.exception.exception import NetworkSecurityException
10+
from network_security.logging.logger import logging
11+
12+
13+
def read_yaml_file(file_path: str) -> dict:
    """Load a YAML document from *file_path* and return it as a dict.

    Raises:
        NetworkSecurityException: If the file cannot be opened or parsed.
    """
    try:
        yaml_path = Path(file_path)
        with yaml_path.open("rb") as stream:
            return yaml.safe_load(stream)
    except Exception as exc:
        raise NetworkSecurityException(exc, sys) from exc
19+
20+
21+
def write_yaml_file(file_path: str, content: object, replace: bool = False) -> None:
    """Serialize *content* as YAML to *file_path*.

    Args:
        file_path: Destination path; its parent directory must already exist.
        content: Any YAML-serializable object.
        replace: When True, delete an existing file before writing.

    Raises:
        NetworkSecurityException: On any filesystem or serialization error.
    """
    try:
        path = Path(file_path)
        if replace and path.exists():
            path.unlink()
        with path.open("w") as file:
            # BUG FIX: yaml.dump was previously called twice, writing the
            # document into the file two times.
            yaml.dump(content, file)
    except Exception as e:
        raise NetworkSecurityException(e, sys) from e

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@ readme = "README.md"
66
requires-python = ">=3.12"
77
dependencies = [
88
"certifi>=2025.6.15",
9+
"dill>=0.4.0",
910
"numpy>=2.3.0",
1011
"pandas>=2.3.0",
12+
"pyaml>=25.5.0",
1113
"pymongo[srv]==3.12",
1214
"python-dotenv>=1.1.0",
1315
"scikit-learn>=1.7.0",

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@ pymongo
55
certifi
66
pymongo[srv]==3.12
77
scikit-learn
8+
dill
9+
pyaml
810

911
# -e .

0 commit comments

Comments
 (0)