diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index 4c7f70fe0..f455d0b78 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 -c requirements-dev.txt + pip install black flake8 - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 38a9e55ae..d11d78b30 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements-dev.txt - pip install -e . + pip install -r requirements.txt + pip install --group dev -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index fb2374b9c..328a90e48 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: self.calculate_variable_value_length( + lambda row: ValuesDatasetBuilder.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index f966d1d33..2609335cd 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,9 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - self._data = self._data.reset_index() - self._data = self._data.set_index("index") - self._data[key] = value + pdf = self._data.compute() + pdf[key] = value.values + self._data = dd.from_pandas(pdf, npartitions=1) elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] diff --git a/cdisc_rules_engine/operations/codelist_extensible.py b/cdisc_rules_engine/operations/codelist_extensible.py index 62cd7b56f..cc3ab7042 100644 --- a/cdisc_rules_engine/operations/codelist_extensible.py +++ b/cdisc_rules_engine/operations/codelist_extensible.py @@ -32,6 +32,10 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) + cast_cols = {self.params.ct_version: "string"} + if self.params.codelist_code in self.evaluation_dataset.columns: + cast_cols[self.params.codelist_code] = "string" + self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: is_extensible = self.evaluation_dataset.merge( ct_df.data, diff --git a/cdisc_rules_engine/operations/codelist_terms.py b/cdisc_rules_engine/operations/codelist_terms.py index 90a0a6d18..5ddcf21ef 100644 --- a/cdisc_rules_engine/operations/codelist_terms.py +++ b/cdisc_rules_engine/operations/codelist_terms.py @@ -64,6 +64,10 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) + cast_cols = {self.params.ct_version: "string"} + if self.params.codelist_code in self.evaluation_dataset.columns: + cast_cols[self.params.codelist_code] = "string" + self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: result = self.evaluation_dataset.merge( ct_df.data, diff --git a/cdisc_rules_engine/services/data_readers/csv_reader.py b/cdisc_rules_engine/services/data_readers/csv_reader.py index ae8aa90d0..901e6d695 100644 --- a/cdisc_rules_engine/services/data_readers/csv_reader.py +++ b/cdisc_rules_engine/services/data_readers/csv_reader.py @@ -1,6 +1,6 @@ import tempfile -from dask.dataframe import dd +import dask.dataframe as dd from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile from cdisc_rules_engine.interfaces import DataReaderInterface diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 34ebe2285..f3175bdd3 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -399,8 +399,19 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext): - return f"{node.full_path}".replace(".[", "[") + def __get_full_path(node: DatumInContext) -> str: + parts = [] + current = node + while current is not None and current.context is not None: + parts.append(str(current.path)) + current = current.context + result = "" + for part in reversed(parts): + if part.startswith("["): + result += part + else: + result = (result + "." if result else "") + part + return result def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index f20506c91..b96dc4540 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if data_type == "NULL" else data_type + "" if (data_type == "NULL" or data_type is None) else data_type for data_type in metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/core.py b/core.py index 98f0a5a70..ea660b867 100644 --- a/core.py +++ b/core.py @@ -357,7 +357,6 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-s", "--standard", required=True, - default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -365,7 +364,6 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-v", "--version", required=True, - default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/docs/contributing.md b/docs/contributing.md index d53a3315e..f087b60b9 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -34,7 +34,7 @@ This project enforces consistent formatting and linting via pre-commit hooks. - [`flake8`](https://flake8.pycqa.org/) — Python linter - [`prettier`](https://prettier.io/) — JSON, YAML, and Markdown formatter -Both `black` and `flake8` are included in `requirements-dev.txt`. After installing dependencies, install the pre-commit hooks: +Both `black` and `flake8` are included in the `dev` dependency group in `pyproject.toml`. After installing dependencies, install the pre-commit hooks: ```bash pre-commit install diff --git a/docs/development.md b/docs/development.md index bb4c27427..f39f2cccc 100644 --- a/docs/development.md +++ b/docs/development.md @@ -47,7 +47,7 @@ source venv/bin/activate .\venv\Scripts\Activate # Install dependencies -python -m pip install -r requirements-dev.txt +pip install -e . && pip install --group dev ``` --- diff --git a/pyproject.toml b/pyproject.toml index bb4d69066..7418ff68c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,49 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version", "dependencies"] +dynamic = ["version"] description = "Open source offering of the cdisc rules engine" readme = "docs/PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] +dependencies = [ + "business_rules_enhanced >=1.4.8", + "cachetools >=6.1.0", + "cdisc-library-client >=0.1.6", + "click >=8.1.7", + "dask[dataframe,array] >=2024.6.0", + "fastparquet >=2024.2.0", + "importlib-metadata >=8.5.0", + "jsonata-python >=0.6.0", + "jsonpath-ng >=1.6.1", + "jsonschema >=4.18.5", + "lxml >=5.2.1", + "numpy >=1.26.0", + "odmlib >=0.1.4", + "openpyxl >=3.1.5", + "pandas >=2.1.4, <3.0.0", + "psutil >=6.1.1", + "pyinstaller >=6.11.0", + "pympler >=1.1", + "pyreadstat >=1.2.7", + "python-dotenv >=1.0.0", + "pyyaml >=6.0.2", + "redis >=4.5.0", + "requests >=2.32.3", + "setuptools >=75.6.0", + "titlecase >=2.4.1", +] + +[dependency-groups] +dev = [ + "black >=24.10.0", + "flake8 >=6.1.0", + "pre-commit >=2.20.0", + "pytest >=7.4.0, <8.0.0", + "pytest-asyncio >=0.21.0", + "pytest-cov >=6.0.0", +] [project.urls] "Homepage" = "https://github.com/cdisc-org/cdisc-rules-engine" @@ -26,5 +63,4 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } -dependencies = {file = ["requirements.txt"]} \ No newline at end of file +version = { attr = "version.__version__" } \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index ac709f651..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -black==24.10.0 -flake8==6.1.0 -pre-commit==2.20.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4481051f6..61556482d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Lockfile: exact pinned versions for reproducible installs. +# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index aff6c25e8..350c2e8dc 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object) + ).astype(object).sort_values("dataset_location").reset_index(drop=True) result_df = result.data[expected_df.columns].reset_index(drop=True)