|
| 1 | +import os |
| 2 | +import sys |
| 3 | +from pathlib import Path |
| 4 | +from urllib.parse import quote_plus |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +import pandas as pd |
| 8 | +import pymongo |
| 9 | +from dotenv import load_dotenv |
| 10 | +from sklearn.model_selection import train_test_split |
| 11 | + |
| 12 | +from network_security.entity.artifact_entity import DataIngestionArtifact |
| 13 | + |
| 14 | +## Configuration of the Data Ingestion Config |
| 15 | +from network_security.entity.config_entity import DataIngestionConfig |
| 16 | +from network_security.exception.exception import NetworkSecurityException |
| 17 | +from network_security.logging.logger import logging |
| 18 | + |
# Load variables from a local .env file (if one exists) into the process
# environment before the credentials are read below.
load_dotenv()

username = os.getenv("MONGO_DB_USERNAME")
password = os.getenv("MONGO_DB_PASSWORD")

# os.getenv() returns None for an unset variable, and quote_plus(None) then
# fails with a TypeError that says nothing about configuration. Report the
# missing variable(s) by name instead. Empty strings are still accepted, as
# they were before.
_missing_env = [
    name
    for name, value in (
        ("MONGO_DB_USERNAME", username),
        ("MONGO_DB_PASSWORD", password),
    )
    if value is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Percent-encode the credentials so that characters such as '@', ':' or '/'
# cannot break the structure of the connection URI.
username = quote_plus(username)
password = quote_plus(password)

MONGO_DB_URL: str = f"mongodb+srv://{username}:{password}@cluster0.l5ee6dv.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
| 28 | + |
| 29 | + |
class DataIngestion:
    """Read a MongoDB collection and materialise it as CSV files.

    The pipeline, driven by :meth:`initiate_data_ingestion`, is:

    1. load the configured collection into a DataFrame,
    2. write the full DataFrame to the feature-store CSV,
    3. split it into train and test sets and write each to its own CSV.

    All paths, the database/collection names and the split ratio are read
    from the ``DataIngestionConfig`` passed to the constructor. Any failure
    inside a step is re-raised as ``NetworkSecurityException(e, sys)`` with
    the original exception chained as its cause.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig) -> None:
        """Keep a reference to the ingestion configuration.

        Args:
            data_ingestion_config: Settings read by the other methods
                (database_name, collection_name, feature_store_file_path,
                training_file_path, testing_file_path,
                train_test_split_ratio).
        """
        self.data_ingestion_config = data_ingestion_config

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Read the configured MongoDB collection into a DataFrame.

        The ``_id`` column is dropped when present and every ``"na"`` value
        is replaced with ``numpy.nan``.

        Returns:
            The collection's documents as a DataFrame.

        Raises:
            NetworkSecurityException: If connecting, querying or converting
                the data fails.
        """
        try:
            database_name = self.data_ingestion_config.database_name
            collection_name = self.data_ingestion_config.collection_name

            # The context manager closes the client's connections once the
            # cursor has been fully read into memory.
            with pymongo.MongoClient(MONGO_DB_URL) as client:
                # Attribute kept for backward compatibility; note that the
                # client is closed by the time this method returns.
                self.mongo_client = client
                records = list(client[database_name][collection_name].find())

            df = pd.DataFrame(records)
            if "_id" in df.columns:
                df = df.drop(columns=["_id"])

            return df.replace({"na": np.nan})
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def export_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write ``dataframe`` to the feature-store CSV and return it unchanged.

        The parent directory of ``feature_store_file_path`` is created if it
        does not exist. The CSV is written with a header and without the
        index.

        Args:
            dataframe: Data to persist.

        Returns:
            The same ``dataframe`` object that was passed in.

        Raises:
            NetworkSecurityException: If the directory or file cannot be
                written.
        """
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            Path(feature_store_file_path).parent.mkdir(parents=True, exist_ok=True)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, dataframe: pd.DataFrame) -> None:
        """Split ``dataframe`` and write the train and test CSV files.

        ``train_test_split_ratio`` from the config is passed to
        ``train_test_split`` as ``test_size``. The parent directories of both
        output files are created if needed.

        Args:
            dataframe: Data to split.

        Raises:
            NetworkSecurityException: If splitting or writing fails.
        """
        try:
            train_set, test_set = train_test_split(
                dataframe,
                test_size=self.data_ingestion_config.train_test_split_ratio,
            )
            logging.info("Performed train test split on the dataframe")

            training_file_path = self.data_ingestion_config.training_file_path
            testing_file_path = self.data_ingestion_config.testing_file_path

            # The two files may live in different directories, so make sure
            # each one's parent exists (not just the training file's).
            for file_path in (training_file_path, testing_file_path):
                Path(file_path).parent.mkdir(parents=True, exist_ok=True)

            logging.info("Exporting train and test file path.")
            train_set.to_csv(training_file_path, index=False, header=True)
            test_set.to_csv(testing_file_path, index=False, header=True)
            logging.info("Exported train and test file path.")

            # Logged last so the message reflects that the work is done.
            logging.info(
                "Exited split_data_as_train_test method of Data_Ingestion class",
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full ingestion pipeline.

        Returns:
            A ``DataIngestionArtifact`` carrying the configured training and
            testing file paths.

        Raises:
            NetworkSecurityException: If any step of the pipeline fails.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            return DataIngestionArtifact(
                trained_file_path=self.data_ingestion_config.training_file_path,
                test_file_path=self.data_ingestion_config.testing_file_path,
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
0 commit comments