Skip to content

Commit 6184871

Browse files
committed
Data Ingestion part completed
1 parent 72268a1 commit 6184871

8 files changed

Lines changed: 285 additions & 3 deletions

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,7 @@ wheels/
1616
.DS_Store
1717

1818
# Log files
19-
logs/
19+
logs/
20+
21+
# Artifacts
22+
Artifacts/
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import os
2+
import sys
3+
from pathlib import Path
4+
from urllib.parse import quote_plus
5+
6+
import numpy as np
7+
import pandas as pd
8+
import pymongo
9+
from dotenv import load_dotenv
10+
from sklearn.model_selection import train_test_split
11+
12+
from network_security.entity.artifact_entity import DataIngestionArtifact
13+
14+
## Configuration of the Data Ingestion Config
15+
from network_security.entity.config_entity import DataIngestionConfig
16+
from network_security.exception.exception import NetworkSecurityException
17+
from network_security.logging.logger import logging
18+
19+
# Load MongoDB credentials from the environment (.env) and build the
# connection string used by DataIngestion below.
load_dotenv()

username = os.getenv("MONGO_DB_USERNAME")
password = os.getenv("MONGO_DB_PASSWORD")

# BUG FIX: fail fast with a clear message instead of the opaque
# TypeError that quote_plus(None) raises when a variable is unset.
if username is None or password is None:
    raise RuntimeError(
        "MONGO_DB_USERNAME and MONGO_DB_PASSWORD must be set in the environment",
    )

# URL-encode the credentials so special characters (':', '@', '/') are safe
# inside the connection URI.
username = quote_plus(username)
password = quote_plus(password)

MONGO_DB_URL: str = f"mongodb+srv://{username}:{password}@cluster0.l5ee6dv.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
28+
29+
30+
class DataIngestion:
    """Reads the raw dataset from MongoDB, persists it to the feature store,
    and splits it into train/test CSV files for the rest of the pipeline.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig) -> None:
        """Store the ingestion configuration (paths, DB/collection names, split ratio)."""
        try:
            self.data_ingestion_config = data_ingestion_config
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Read data from mongodb.

        Returns the configured collection as a DataFrame with Mongo's
        internal ``_id`` column dropped and literal "na" strings replaced
        by ``np.nan`` so downstream code sees real missing values.
        """
        try:
            database_name = self.data_ingestion_config.database_name
            collection_name = self.data_ingestion_config.collection_name
            self.mongo_client = pymongo.MongoClient(MONGO_DB_URL)
            collection = self.mongo_client[database_name][collection_name]

            df = pd.DataFrame(list(collection.find()))
            if "_id" in df.columns.to_list():
                df = df.drop(columns=["_id"], axis=1)

            df.replace({"na": np.nan}, inplace=True)
            return df
        except Exception as e:
            # BUG FIX: was `raise NetworkSecurityException` (the bare class),
            # which discarded the original error and traceback; wrap it like
            # every other handler in this class.
            raise NetworkSecurityException(e, sys)

    def export_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write *dataframe* to the configured feature-store CSV and return it unchanged."""
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            dir_path = Path(feature_store_file_path).parent
            Path(dir_path).mkdir(parents=True, exist_ok=True)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe

        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def split_data_as_train_test(self, dataframe: pd.DataFrame) -> None:
        """Split *dataframe* into train/test sets and write each to its configured CSV path."""
        try:
            train_set, test_set = train_test_split(
                dataframe, test_size=self.data_ingestion_config.train_test_split_ratio,
            )
            logging.info("Performed train test split on the dataframe")

            logging.info(
                "Exited split_data_as_train_test method of Data_Ingestion class",
            )
            # Train and test files share the same parent directory, so one
            # mkdir covers both output paths.
            dir_path = Path(self.data_ingestion_config.training_file_path).parent

            # BUG FIX: the directory was created twice back to back; a single
            # mkdir with exist_ok=True is sufficient.
            Path(dir_path).mkdir(parents=True, exist_ok=True)

            logging.info("Exporting train and test file path.")

            train_set.to_csv(
                self.data_ingestion_config.training_file_path, index=False, header=True,
            )

            test_set.to_csv(
                self.data_ingestion_config.testing_file_path, index=False, header=True,
            )
            logging.info("Exported train and test file path.")

        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full ingestion flow: export from Mongo, persist, split, return artifact."""
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            dataingestionartifact = DataIngestionArtifact(
                trained_file_path=self.data_ingestion_config.training_file_path,
                test_file_path=self.data_ingestion_config.testing_file_path,
            )
            return dataingestionartifact

        except Exception as e:
            # BUG FIX: was `raise NetworkSecurityException` (bare class); wrap
            # the original error so the caller sees what actually failed.
            raise NetworkSecurityException(e, sys)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import os
2+
import sys
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import pandas as pd
7+
8+
"""
9+
defining common constant variable for training pipeline
10+
"""
11+
TARGET_COLUMN = "Result"
12+
PIPELINE_NAME: str = "NetworkSecurity"
13+
ARTIFACT_DIR: str = "Artifacts"
14+
FILE_NAME: str = "phisingData.csv"
15+
16+
TRAIN_FILE_NAME: str = "train.csv"
17+
TEST_FILE_NAME: str = "test.csv"
18+
SCHEMA_FILE_PATH = Path("data_schema") / "schema.yaml"
19+
SAVED_MODEL_DIR = Path("saved_models")
20+
MODEL_FILE_NAME = "model.pkl"
21+
22+
23+
"""
24+
Data Ingestion related constant start with DATA_INGESTION VAR NAME
25+
"""
26+
DATA_INGESTION_COLLECTION_NAME: str = "NetworkData"
27+
DATA_INGESTION_DATABASE_NAME: str = "TEST_DB"
28+
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
29+
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
30+
DATA_INGESTION_INGESTED_DIR: str = "ingested"
31+
DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
class DataIngestionArtifact:
    """Output of the data-ingestion stage: the file paths handed to the next pipeline step."""

    # Path to the exported training CSV.
    trained_file_path: str
    # Path to the exported test CSV.
    test_file_path: str
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from datetime import datetime
2+
from pathlib import Path
3+
4+
from network_security.constant import training_pipeline
5+
6+
# NOTE(review): removed module-level debug prints of
# training_pipeline.PIPELINE_NAME / ARTIFACT_DIR — importing a config module
# should not write to stdout.
8+
9+
10+
class TrainingPipelineConfig:
    """Top-level pipeline settings: pipeline/artifact names plus a timestamped artifact directory."""

    # BUG FIX (annotation): the parameter defaulted to None but was annotated
    # as plain `datetime`; it is `datetime | None`.
    def __init__(self, timestamp: datetime | None = None) -> None:
        """Build the config.

        Args:
            timestamp: Run timestamp used to name the artifact directory;
                defaults to the current local time.
        """
        if timestamp is None:
            timestamp = datetime.now().astimezone()
        timestamp_str = timestamp.strftime("%m_%d_%Y_%H_%M_%S")
        self.pipeline_name = training_pipeline.PIPELINE_NAME
        self.artifact_name = training_pipeline.ARTIFACT_DIR
        # Each run gets its own subdirectory, e.g. Artifacts/06_15_2025_10_30_00.
        self.artifact_dir = Path(self.artifact_name) / timestamp_str
        self.model_dir = Path("final_model")
        self.timestamp: str = timestamp_str
20+
21+
22+
class DataIngestionConfig:
    """Filesystem layout and source-database settings for the data-ingestion stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive every ingestion path from the run's artifact directory."""
        tp = training_pipeline
        root = Path(training_pipeline_config.artifact_dir) / tp.DATA_INGESTION_DIR_NAME
        ingested = root / tp.DATA_INGESTION_INGESTED_DIR

        # Directory layout:
        #   <artifact_dir>/data_ingestion/feature_store/<raw csv>
        #   <artifact_dir>/data_ingestion/ingested/{train,test}.csv
        self.data_ingestion_dir: Path = root
        self.feature_store_file_path: Path = (
            root / tp.DATA_INGESTION_FEATURE_STORE_DIR / tp.FILE_NAME
        )
        self.training_file_path: Path = ingested / tp.TRAIN_FILE_NAME
        self.testing_file_path: Path = ingested / tp.TEST_FILE_NAME

        self.train_test_split_ratio: float = tp.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
        self.collection_name: str = tp.DATA_INGESTION_COLLECTION_NAME
        self.database_name: str = tp.DATA_INGESTION_DATABASE_NAME

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ dependencies = [
88
"certifi>=2025.6.15",
99
"numpy>=2.3.0",
1010
"pandas>=2.3.0",
11-
"pymongo[srv]>=3.12.0",
11+
"pymongo[srv]==3.12",
1212
"python-dotenv>=1.1.0",
13+
"scikit-learn>=1.7.0",
1314
"setuptools>=80.9.0",
1415
]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@ numpy
44
pymongo
55
certifi
66
pymongo[srv]==3.12
7+
scikit-learn
78

89
# -e .

0 commit comments

Comments
 (0)