diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index f455d0b78..4c7f70fe0 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 + pip install black flake8 -c requirements-dev.txt - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index d11d78b30..38a9e55ae 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements.txt - pip install --group dev -e . + pip install -r requirements-dev.txt + pip install -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index 328a90e48..fb2374b9c 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: ValuesDatasetBuilder.calculate_variable_value_length( + lambda row: self.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 2609335cd..f966d1d33 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,9 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - pdf = self._data.compute() - pdf[key] = value.values - self._data = dd.from_pandas(pdf, npartitions=1) + self._data = self._data.reset_index() + self._data = self._data.set_index("index") + self._data[key] = value elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] diff --git a/cdisc_rules_engine/operations/codelist_extensible.py b/cdisc_rules_engine/operations/codelist_extensible.py index cc3ab7042..62cd7b56f 100644 --- a/cdisc_rules_engine/operations/codelist_extensible.py +++ b/cdisc_rules_engine/operations/codelist_extensible.py @@ -32,10 +32,6 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) - cast_cols = {self.params.ct_version: "string"} - if self.params.codelist_code in self.evaluation_dataset.columns: - cast_cols[self.params.codelist_code] = "string" - self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: is_extensible = self.evaluation_dataset.merge( ct_df.data, diff --git a/cdisc_rules_engine/operations/codelist_terms.py b/cdisc_rules_engine/operations/codelist_terms.py index 5ddcf21ef..90a0a6d18 100644 --- a/cdisc_rules_engine/operations/codelist_terms.py +++ b/cdisc_rules_engine/operations/codelist_terms.py @@ -64,10 +64,6 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) - cast_cols = {self.params.ct_version: "string"} - if self.params.codelist_code in self.evaluation_dataset.columns: - cast_cols[self.params.codelist_code] = "string" - self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: result = self.evaluation_dataset.merge( ct_df.data, diff --git a/cdisc_rules_engine/services/data_readers/csv_reader.py b/cdisc_rules_engine/services/data_readers/csv_reader.py index 901e6d695..ae8aa90d0 100644 --- a/cdisc_rules_engine/services/data_readers/csv_reader.py +++ b/cdisc_rules_engine/services/data_readers/csv_reader.py @@ -1,6 +1,6 @@ import tempfile -import dask.dataframe as dd +from dask.dataframe import dd from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile from cdisc_rules_engine.interfaces import DataReaderInterface diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index f3175bdd3..34ebe2285 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -399,19 +399,8 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext) -> str: - parts = [] - current = node - while current is not None and current.context is not None: - parts.append(str(current.path)) - current = current.context - result = "" - for part in reversed(parts): - if part.startswith("["): - result += part - else: - result = (result + "." if result else "") + part - return result + def __get_full_path(node: DatumInContext): + return f"{node.full_path}".replace(".[", "[") def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index b96dc4540..f20506c91 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if (data_type == "NULL" or data_type is None) else data_type + "" if data_type == "NULL" else data_type for data_type in metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/core.py b/core.py index ea660b867..98f0a5a70 100644 --- a/core.py +++ b/core.py @@ -357,6 +357,7 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-s", "--standard", required=True, + default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -364,6 +365,7 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-v", "--version", required=True, + default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/docs/contributing.md b/docs/contributing.md index f087b60b9..d53a3315e 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -34,7 +34,7 @@ This project enforces consistent formatting and linting via pre-commit hooks. - [`flake8`](https://flake8.pycqa.org/) — Python linter - [`prettier`](https://prettier.io/) — JSON, YAML, and Markdown formatter -Both `black` and `flake8` are included in the `dev` dependency group in `pyproject.toml`. After installing dependencies, install the pre-commit hooks: +Both `black` and `flake8` are included in `requirements-dev.txt`. After installing dependencies, install the pre-commit hooks: ```bash pre-commit install diff --git a/docs/development.md b/docs/development.md index f39f2cccc..bb4c27427 100644 --- a/docs/development.md +++ b/docs/development.md @@ -47,7 +47,7 @@ source venv/bin/activate .\venv\Scripts\Activate # Install dependencies -pip install -e . && pip install --group dev +python -m pip install -r requirements-dev.txt ``` --- diff --git a/pyproject.toml b/pyproject.toml index 7418ff68c..bb4d69066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,49 +4,12 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version"] +dynamic = ["version", "dependencies"] description = "Open source offering of the cdisc rules engine" readme = "docs/PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] -dependencies = [ - "business_rules_enhanced >=1.4.8", - "cachetools >=6.1.0", - "cdisc-library-client >=0.1.6", - "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0", - "fastparquet >=2024.2.0", - "importlib-metadata >=8.5.0", - "jsonata-python >=0.6.0", - "jsonpath-ng >=1.6.1", - "jsonschema >=4.18.5", - "lxml >=5.2.1", - "numpy >=1.26.0", - "odmlib >=0.1.4", - "openpyxl >=3.1.5", - "pandas >=2.1.4, <3.0.0", - "psutil >=6.1.1", - "pyinstaller >=6.11.0", - "pympler >=1.1", - "pyreadstat >=1.2.7", - "python-dotenv >=1.0.0", - "pyyaml >=6.0.2", - "redis >=4.5.0", - "requests >=2.32.3", - "setuptools >=75.6.0", - "titlecase >=2.4.1", -] - -[dependency-groups] -dev = [ - "black >=24.10.0", - "flake8 >=6.1.0", - "pre-commit >=2.20.0", - "pytest >=7.4.0, <8.0.0", - "pytest-asyncio >=0.21.0", - "pytest-cov >=6.0.0", -] [project.urls] "Homepage" = "https://github.com/cdisc-org/cdisc-rules-engine" @@ -63,4 +26,5 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } \ No newline at end of file +version = { attr = "version.__version__" } +dependencies = {file = ["requirements.txt"]} \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..ac709f651 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +-r requirements.txt +black==24.10.0 +flake8==6.1.0 +pre-commit==2.20.0 +pytest==7.4.0 +pytest-asyncio==0.21.0 +pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 61556482d..4481051f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ -# Lockfile: exact pinned versions for reproducible installs. -# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index 350c2e8dc..aff6c25e8 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object).sort_values("dataset_location").reset_index(drop=True) + ).astype(object) result_df = result.data[expected_df.columns].reset_index(drop=True)