From 8ffbf6e6dfdfcc685c0e6215a36fd1e331917b04 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 18 Jun 2026 17:26:20 -0400 Subject: [PATCH 1/2] tests --- .../interfaces/data_reader_interface.py | 7 +- .../services/data_readers/csv_reader.py | 63 ++++- .../data_readers/data_reader_factory.py | 8 +- .../data_services/local_data_service.py | 1 + tests/unit/test_csv_reader.py | 220 +++++++++++++++++- 5 files changed, 287 insertions(+), 12 deletions(-) diff --git a/cdisc_rules_engine/interfaces/data_reader_interface.py b/cdisc_rules_engine/interfaces/data_reader_interface.py index 903d1b511..19283f0db 100644 --- a/cdisc_rules_engine/interfaces/data_reader_interface.py +++ b/cdisc_rules_engine/interfaces/data_reader_interface.py @@ -8,14 +8,19 @@ class DataReaderInterface: """ def __init__( - self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING + self, + dataset_implementation=PandasDataset, + encoding: str = DEFAULT_ENCODING, + variables_csv_path: str = None, ): """ :param DatasetInterface dataset_implementation : The dataset type to return. :param str encoding : The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8). + :param str variables_csv_path : Optional path to a `_variables.csv` declaring variable metadata """ self.dataset_implementation = dataset_implementation self.encoding = encoding + self.variables_csv_path = variables_csv_path def read(self, data): """ diff --git a/cdisc_rules_engine/services/data_readers/csv_reader.py b/cdisc_rules_engine/services/data_readers/csv_reader.py index 901e6d695..f45566518 100644 --- a/cdisc_rules_engine/services/data_readers/csv_reader.py +++ b/cdisc_rules_engine/services/data_readers/csv_reader.py @@ -1,15 +1,24 @@ import tempfile +from pathlib import Path import dask.dataframe as dd +from numpy import nan from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile from cdisc_rules_engine.interfaces import DataReaderInterface import pandas as pd - from cdisc_rules_engine.models.dataset import PandasDataset, DaskDataset class CSVReader(DataReaderInterface): + dtype_mapping = { + "Char": str, + "Num": float, + "Boolean": "boolean", + "Number": float, + "String": str, + } + def read(self, data): """ Function for reading data from a specific file type and returning a @@ -17,11 +26,53 @@ def read(self, data): """ raise NotImplementedError + def _get_declared_dtypes(self, file_path: str) -> dict: + variables_csv_path = ( + Path(self.variables_csv_path) + if self.variables_csv_path + else Path(file_path).parent / "_variables.csv" + ) + try: + meta_df = pd.read_csv(variables_csv_path, encoding=self.encoding) + except (UnicodeDecodeError, UnicodeError) as e: + raise InvalidCSVFile( + f"\n Error reading variables metadata from: {variables_csv_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) + except Exception as e: + raise InvalidCSVFile( + f"\n Error reading variables metadata from: {variables_csv_path}" + f"\n {type(e).__name__}: {e}" + ) + dataset_name = Path(file_path).stem.lower() + meta_df["dataset"] = meta_df["dataset"].apply( + lambda x: Path(str(x)).stem.lower() + ) + dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name] + if dataset_meta_df.empty: + return {} + return { + row["variable"]: self.dtype_mapping.get(row["type"], str) + for _, row in dataset_meta_df.iterrows() + } + def from_file(self, file_path): try: + declared_dtypes = self._get_declared_dtypes(file_path) with open(file_path, "r", encoding=self.encoding) as fp: - data = pd.read_csv(fp, sep=",", header=0, index_col=False) - data = data.where(data.notna(), None) + data = pd.read_csv( + fp, + sep=",", + header=0, + index_col=False, + dtype=declared_dtypes or None, + na_values=[""], + keep_default_na=False, + true_values=["True", "TRUE", "true", "1"], + false_values=["False", "FALSE", "false", "0"], + ) + data = data.replace({nan: None}) if self.dataset_implementation == PandasDataset: return PandasDataset(data) else: @@ -42,23 +93,17 @@ def from_file(self, file_path): def to_parquet(self, file_path: str) -> tuple[int, str]: temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") - dataset = pd.read_csv(file_path, chunksize=20000, encoding=self.encoding) - created = False num_rows = 0 - for chunk in dataset: num_rows += len(chunk) - if not created: chunk.to_parquet(temp_file.name, engine="fastparquet") created = True else: chunk.to_parquet(temp_file.name, engine="fastparquet", append=True) - if not created: empty_df = pd.read_csv(file_path, nrows=0, encoding=self.encoding) empty_df.to_parquet(temp_file.name, engine="fastparquet") - return num_rows, temp_file.name diff --git a/cdisc_rules_engine/services/data_readers/data_reader_factory.py b/cdisc_rules_engine/services/data_readers/data_reader_factory.py index 6f835e789..38fca02f9 100644 --- a/cdisc_rules_engine/services/data_readers/data_reader_factory.py +++ b/cdisc_rules_engine/services/data_readers/data_reader_factory.py @@ -34,10 +34,12 @@ def __init__( service_name: str = None, dataset_implementation=PandasDataset, encoding: str = None, + variables_csv_path: str = None, ): self._default_service_name = service_name self.dataset_implementation = dataset_implementation self.encoding = encoding + self.variables_csv_path = variables_csv_path @classmethod def register_service(cls, name: str, service: Type[DataReaderInterface]): @@ -58,7 +60,11 @@ def get_service(self, name: str = None, **kwargs) -> DataReaderInterface: if service_name in self._reader_map: reader_class = self._reader_map[service_name] encoding = self.encoding or DEFAULT_ENCODING - return reader_class(self.dataset_implementation, encoding=encoding) + return reader_class( + self.dataset_implementation, + encoding=encoding, + variables_csv_path=self.variables_csv_path, + ) raise ValueError( f"Service name must be in {list(self._reader_map.keys())}, " f"given service name is {service_name}" diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index c9878b0e7..18c6fd41a 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -112,6 +112,7 @@ def get_instance( "dataset_implementation", PandasDataset ), encoding=encoding, + variables_csv_path=kwargs.get("variables_csv_path"), ), config=config, **kwargs, diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index b30d98539..c2888325f 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -1,4 +1,5 @@ import logging +import os import tempfile import textwrap from datetime import datetime @@ -351,6 +352,14 @@ def test_empty_data_file_returns_empty_first_record(self): assert result["first_record"] == {} +CSV_READER_VARIABLES_CSV = textwrap.dedent("""\ + dataset,variable,label,type,length + data,id,ID,Num,10 + data,name,Name,Char,50 + data,age,Age,Num,8 +""") + + class TestCSVReaderFromFile: """Tests for CSVReader.from_file()""" @@ -358,8 +367,9 @@ def setup_method(self): self.tmpdir = tempfile.mkdtemp() self.csv_path = Path(self.tmpdir) / "data.csv" _write(self.csv_path, DATA_CSV) + _write(Path(self.tmpdir) / "_variables.csv", CSV_READER_VARIABLES_CSV) - def test_returns_dataframe(self): + def test_returns_pandas_dataset(self): reader = CSVReader() df = reader.from_file(str(self.csv_path)) assert isinstance(df, PandasDataset) @@ -394,6 +404,214 @@ def test_empty_csv_returns_empty_dataframe(self): assert df.empty assert list(df.columns) == ["id", "name", "age"] + def test_num_column_no_blanks_values_are_float(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert all(isinstance(v, float) for v in df["age"] if v is not None) + + def test_num_column_no_blanks_correct_values(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert df.iloc[0]["age"] == 30 + assert df.iloc[1]["age"] == 25 + assert df.iloc[2]["age"] == 40 + + def test_num_column_with_blanks_non_null_are_float(self): + csv = textwrap.dedent("""\ + id,name,age + 1,Alice,30 + 2,Bob, + 3,Carol,40 + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + blanks,id,ID,Num,10 + blanks,name,Name,Char,50 + blanks,age,Age,Num,8 + """) + path = Path(self.tmpdir) / "blanks.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + non_null = [v for v in df["age"] if v is not None] + assert all(isinstance(v, float) for v in non_null) + + def test_num_column_with_blanks_missing_is_none(self): + csv = textwrap.dedent("""\ + id,name,age + 1,Alice,30 + 2,Bob, + 3,Carol,40 + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + blanks,id,ID,Num,10 + blanks,name,Name,Char,50 + blanks,age,Age,Num,8 + """) + path = Path(self.tmpdir) / "blanks.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert df.iloc[1]["age"] is None + + def test_num_column_with_decimals_stays_float(self): + csv = textwrap.dedent("""\ + id,name,score + 1,Alice,10.5 + 2,Bob,20.0 + 3,Carol,30.8 + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + decimals,id,ID,Num,10 + decimals,name,Name,Char,50 + decimals,score,Score,Num,8 + """) + path = Path(self.tmpdir) / "decimals.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert all(isinstance(v, float) for v in df["score"] if v is not None) + + def test_char_column_with_numeric_looking_values_stays_str(self): + csv = textwrap.dedent("""\ + code,name + 001,Alice + 002,Bob + 003,Carol + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + codes,code,Code,Char,3 + codes,name,Name,Char,50 + """) + path = Path(self.tmpdir) / "codes.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert all(isinstance(v, str) for v in df["code"]) + assert df.iloc[0]["code"] == "001" + + def test_char_blank_cell_becomes_none(self): + csv = textwrap.dedent("""\ + id,status + 1,ACTIVE + 2, + 3,INACTIVE + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + status,id,ID,Num,10 + status,status,Status,Char,8 + """) + path = Path(self.tmpdir) / "status.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert df.iloc[1]["status"] is None + + def test_char_literal_na_not_converted_to_none(self): + csv = textwrap.dedent("""\ + id,status + 1,ACTIVE + 2,NA + 3,INACTIVE + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + status,id,ID,Num,10 + status,status,Status,Char,8 + """) + path = Path(self.tmpdir) / "status.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert df.iloc[1]["status"] == "NA" + + def test_boolean_column_true_false_strings(self): + csv = textwrap.dedent("""\ + id,active + 1,True + 2,False + 3,TRUE + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + bools,id,ID,Num,10 + bools,active,Active,Boolean,1 + """) + path = Path(self.tmpdir) / "bools.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert df.iloc[0]["active"] is True + assert df.iloc[1]["active"] is False + + def test_boolean_column_blank_becomes_none(self): + csv = textwrap.dedent("""\ + id,active + 1,True + 2, + 3,False + """) + variables = textwrap.dedent("""\ + dataset,variable,label,type,length + bools,id,ID,Num,10 + bools,active,Active,Boolean,1 + """) + path = Path(self.tmpdir) / "bools.csv" + _write(path, csv) + _write(Path(self.tmpdir) / "_variables.csv", variables) + reader = CSVReader() + df = reader.from_file(str(path)) + assert df.iloc[1]["active"] is None + + def test_dataset_not_in_variables_csv_falls_back_to_inference(self): + """If _variables.csv exists but has no entry for this dataset, + pandas inference is used and no error is raised.""" + _write( + Path(self.tmpdir) / "_variables.csv", + textwrap.dedent("""\ + dataset,variable,label,type,length + other,id,ID,Num,10 + """), + ) + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert isinstance(df, PandasDataset) + assert list(df.columns) == ["id", "name", "age"] + + def test_missing_variables_csv_raises_invalid_csv_file(self): + vars_path = Path(self.tmpdir) / "_variables.csv" + if vars_path.exists(): + os.remove(vars_path) + reader = CSVReader() + with pytest.raises(InvalidCSVFile): + reader.from_file(str(self.csv_path)) + + def test_encoding_error_in_variables_csv_raises_invalid_csv_file(self): + vars_path = Path(self.tmpdir) / "_variables.csv" + vars_path.write_bytes( + b"dataset,variable,label,type,length\ndata,id,caf\xe9,Num,10\n" + ) + reader = CSVReader(encoding="utf-8") + with pytest.raises(InvalidCSVFile, match="_variables.csv"): + reader.from_file(str(self.csv_path)) + + def test_encoding_error_in_data_file_raises_invalid_csv_file(self): + self.csv_path.write_bytes(b"id,name,age\n1,Caf\xe9,30\n") + reader = CSVReader(encoding="utf-8") + with pytest.raises(InvalidCSVFile, match="data.csv"): + reader.from_file(str(self.csv_path)) + class TestCSVReaderToParquet: From b748fab6e00fdfdf11d6255e958c56f069a1c548 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 18 Jun 2026 17:39:11 -0400 Subject: [PATCH 2/2] update assertion --- tests/unit/test_csv_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py index c2888325f..46ce37e70 100644 --- a/tests/unit/test_csv_reader.py +++ b/tests/unit/test_csv_reader.py @@ -552,8 +552,8 @@ def test_boolean_column_true_false_strings(self): _write(Path(self.tmpdir) / "_variables.csv", variables) reader = CSVReader() df = reader.from_file(str(path)) - assert df.iloc[0]["active"] is True - assert df.iloc[1]["active"] is False + assert df.iloc[0]["active"] + assert not df.iloc[1]["active"] def test_boolean_column_blank_becomes_none(self): csv = textwrap.dedent("""\