Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cdisc_rules_engine/interfaces/data_reader_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@ class DataReaderInterface:
"""

def __init__(
self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING
self,
dataset_implementation=PandasDataset,
encoding: str = DEFAULT_ENCODING,
variables_csv_path: str = None,
):
"""
:param DatasetInterface dataset_implementation : The dataset type to return.
:param str encoding : The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
:param str variables_csv_path : Optional path to a `_variables.csv` declaring variable metadata
"""
self.dataset_implementation = dataset_implementation
self.encoding = encoding
self.variables_csv_path = variables_csv_path

def read(self, data):
"""
Expand Down
63 changes: 54 additions & 9 deletions cdisc_rules_engine/services/data_readers/csv_reader.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,78 @@
import tempfile
from pathlib import Path

import dask.dataframe as dd
from numpy import nan

from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile
from cdisc_rules_engine.interfaces import DataReaderInterface
import pandas as pd

from cdisc_rules_engine.models.dataset import PandasDataset, DaskDataset


class CSVReader(DataReaderInterface):
dtype_mapping = {
"Char": str,
"Num": float,
"Boolean": "boolean",
"Number": float,
"String": str,
}

def read(self, data):
"""
Function for reading data from a specific file type and returning a
pandas dataframe of the data.
"""
raise NotImplementedError

def _get_declared_dtypes(self, file_path: str) -> dict:
variables_csv_path = (
Path(self.variables_csv_path)
if self.variables_csv_path
else Path(file_path).parent / "_variables.csv"
)
try:
meta_df = pd.read_csv(variables_csv_path, encoding=self.encoding)
except (UnicodeDecodeError, UnicodeError) as e:
raise InvalidCSVFile(
f"\n Error reading variables metadata from: {variables_csv_path}"
f"\n Failed to decode with {self.encoding} encoding: {e}"
f"\n Please specify the correct encoding using the -e flag."
)
except Exception as e:
raise InvalidCSVFile(
f"\n Error reading variables metadata from: {variables_csv_path}"
f"\n {type(e).__name__}: {e}"
)
dataset_name = Path(file_path).stem.lower()
meta_df["dataset"] = meta_df["dataset"].apply(
lambda x: Path(str(x)).stem.lower()
)
dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name]
if dataset_meta_df.empty:
return {}
return {
row["variable"]: self.dtype_mapping.get(row["type"], str)
for _, row in dataset_meta_df.iterrows()
}

def from_file(self, file_path):
try:
declared_dtypes = self._get_declared_dtypes(file_path)
with open(file_path, "r", encoding=self.encoding) as fp:
data = pd.read_csv(fp, sep=",", header=0, index_col=False)
data = data.where(data.notna(), None)
data = pd.read_csv(
fp,
sep=",",
header=0,
index_col=False,
dtype=declared_dtypes or None,
na_values=[""],
keep_default_na=False,
true_values=["True", "TRUE", "true", "1"],
false_values=["False", "FALSE", "false", "0"],
)
data = data.replace({nan: None})
if self.dataset_implementation == PandasDataset:
return PandasDataset(data)
else:
Expand All @@ -42,23 +93,17 @@ def from_file(self, file_path):

def to_parquet(self, file_path: str) -> tuple[int, str]:
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")

dataset = pd.read_csv(file_path, chunksize=20000, encoding=self.encoding)

created = False
num_rows = 0

for chunk in dataset:
num_rows += len(chunk)

if not created:
chunk.to_parquet(temp_file.name, engine="fastparquet")
created = True
else:
chunk.to_parquet(temp_file.name, engine="fastparquet", append=True)

if not created:
empty_df = pd.read_csv(file_path, nrows=0, encoding=self.encoding)
empty_df.to_parquet(temp_file.name, engine="fastparquet")

return num_rows, temp_file.name
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ def __init__(
service_name: str = None,
dataset_implementation=PandasDataset,
encoding: str = None,
variables_csv_path: str = None,
):
self._default_service_name = service_name
self.dataset_implementation = dataset_implementation
self.encoding = encoding
self.variables_csv_path = variables_csv_path

@classmethod
def register_service(cls, name: str, service: Type[DataReaderInterface]):
Expand All @@ -58,7 +60,11 @@ def get_service(self, name: str = None, **kwargs) -> DataReaderInterface:
if service_name in self._reader_map:
reader_class = self._reader_map[service_name]
encoding = self.encoding or DEFAULT_ENCODING
return reader_class(self.dataset_implementation, encoding=encoding)
return reader_class(
self.dataset_implementation,
encoding=encoding,
variables_csv_path=self.variables_csv_path,
)
raise ValueError(
f"Service name must be in {list(self._reader_map.keys())}, "
f"given service name is {service_name}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def get_instance(
"dataset_implementation", PandasDataset
),
encoding=encoding,
variables_csv_path=kwargs.get("variables_csv_path"),
),
config=config,
**kwargs,
Expand Down
Loading
Loading