From 8ae46a9926d71b72c57c660bb68043b39b184400 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:03:13 -0700 Subject: [PATCH 1/8] Define dependencies in pyproject.toml Moves dependency constraints to pyproject.toml. Makes requirements.txt a lockfile. --- .github/workflows/lint-format.yml | 2 +- .github/workflows/test-unit.yml | 4 +-- docs/contributing.md | 2 +- docs/development.md | 2 +- pyproject.toml | 42 ++++++++++++++++++++++++++++--- requirements-dev.txt | 7 ------ requirements.txt | 2 ++ 7 files changed, 46 insertions(+), 15 deletions(-) delete mode 100644 requirements-dev.txt diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index 4c7f70fe0..f455d0b78 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 -c requirements-dev.txt + pip install black flake8 - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 38a9e55ae..d11d78b30 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements-dev.txt - pip install -e . + pip install -r requirements.txt + pip install --group dev -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/docs/contributing.md b/docs/contributing.md index d53a3315e..f087b60b9 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -34,7 +34,7 @@ This project enforces consistent formatting and linting via pre-commit hooks. - [`flake8`](https://flake8.pycqa.org/) — Python linter - [`prettier`](https://prettier.io/) — JSON, YAML, and Markdown formatter -Both `black` and `flake8` are included in `requirements-dev.txt`. 
After installing dependencies, install the pre-commit hooks: +Both `black` and `flake8` are included in the `dev` dependency group in `pyproject.toml`. After installing dependencies, install the pre-commit hooks: ```bash pre-commit install diff --git a/docs/development.md b/docs/development.md index bb4c27427..f39f2cccc 100644 --- a/docs/development.md +++ b/docs/development.md @@ -47,7 +47,7 @@ source venv/bin/activate .\venv\Scripts\Activate # Install dependencies -python -m pip install -r requirements-dev.txt +pip install -e . && pip install --group dev ``` --- diff --git a/pyproject.toml b/pyproject.toml index bb4d69066..53067241a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,49 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version", "dependencies"] +dynamic = ["version"] description = "Open source offering of the cdisc rules engine" readme = "docs/PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] +dependencies = [ + "business_rules_enhanced >=1.4.8", + "cachetools >=6.1.0", + "cdisc-library-client >=0.1.6", + "click >=8.1.7, <8.3.0", + "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "fastparquet >=2024.2.0", + "importlib-metadata >=8.5.0", + "jsonata-python >=0.6.0", + "jsonpath-ng >=1.6.1, <1.8.0", + "jsonschema >=4.18.5", + "lxml >=5.2.1", + "numpy >=1.26.0", + "odmlib >=0.1.4", + "openpyxl >=3.1.5", + "pandas >=2.1.4, <2.2.0", + "psutil >=6.1.1", + "pyinstaller >=6.11.0", + "pympler >=1.1", + "pyreadstat >=1.2.7, <1.2.9", + "python-dotenv >=1.0.0", + "pyyaml >=6.0.2", + "redis >=4.5.0", + "requests >=2.32.3", + "setuptools >=75.6.0", + "titlecase >=2.4.1", +] + +[dependency-groups] +dev = [ + "black >=24.10.0", + "flake8 >=6.1.0", + "pre-commit >=2.20.0", + "pytest >=7.4.0, <8.0.0", + "pytest-asyncio >=0.21.0", + "pytest-cov >=6.0.0", +] [project.urls] "Homepage" = "https://github.com/cdisc-org/cdisc-rules-engine" 
@@ -26,5 +63,4 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } -dependencies = {file = ["requirements.txt"]} \ No newline at end of file +version = { attr = "version.__version__" } \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index ac709f651..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -black==24.10.0 -flake8==6.1.0 -pre-commit==2.20.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4481051f6..61556482d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Lockfile: exact pinned versions for reproducible installs. +# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 From c6d67553628dca6e18563af5073766a5f96b9a30 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:08:54 -0700 Subject: [PATCH 2/8] Support click 8.3.0 Fixes an incompatibility caused by click 8.3.0, which passes the default value as-is. 
--- core.py | 2 -- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/core.py b/core.py index 98f0a5a70..ea660b867 100644 --- a/core.py +++ b/core.py @@ -357,7 +357,6 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-s", "--standard", required=True, - default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -365,7 +364,6 @@ def load_custom_dotenv_from_data_options(_ctx, _param, value): "-v", "--version", required=True, - default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/pyproject.toml b/pyproject.toml index 53067241a..cc3f2749b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "business_rules_enhanced >=1.4.8", "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", - "click >=8.1.7, <8.3.0", + "click >=8.1.7", "dask[dataframe,array] >=2024.6.0, <2024.8.1", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", From 024eaf2a19ace0a3c7b18446f0e15092f574a193 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:09:40 -0700 Subject: [PATCH 3/8] Support pyreadstat 1.2.9 Fixes an incompatibility caused by pyreadstat 1.2.9, which changed original_variable_type from 'NULL' to None --- cdisc_rules_engine/services/datasetxpt_metadata_reader.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index f20506c91..b96dc4540 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if data_type == "NULL" else data_type + "" if (data_type == "NULL" or data_type is None) else data_type for data_type in 
metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/pyproject.toml b/pyproject.toml index cc3f2749b..d2ecefed3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "psutil >=6.1.1", "pyinstaller >=6.11.0", "pympler >=1.1", - "pyreadstat >=1.2.7, <1.2.9", + "pyreadstat >=1.2.7", "python-dotenv >=1.0.0", "pyyaml >=6.0.2", "redis >=4.5.0", From 0c89818393b238f5d25509db86b335f895a44e95 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:11:02 -0700 Subject: [PATCH 4/8] Support jsonpath-ng 1.8.0 Works around a behavior change in jsonpath-ng 1.8.0 where Child.str gets wrapped in parentheses. --- .../services/data_services/usdm_data_service.py | 15 +++++++++++++-- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 34ebe2285..f3175bdd3 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -399,8 +399,19 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext): - return f"{node.full_path}".replace(".[", "[") + def __get_full_path(node: DatumInContext) -> str: + parts = [] + current = node + while current is not None and current.context is not None: + parts.append(str(current.path)) + current = current.context + result = "" + for part in reversed(parts): + if part.startswith("["): + result += part + else: + result = (result + "." 
if result else "") + part + return result def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/pyproject.toml b/pyproject.toml index d2ecefed3..bdf60450e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", - "jsonpath-ng >=1.6.1, <1.8.0", + "jsonpath-ng >=1.6.1", "jsonschema >=4.18.5", "lxml >=5.2.1", "numpy >=1.26.0", From 70fbadbe051dd20f07368a121e3084a0e538549f Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Thu, 4 Jun 2026 16:19:11 -0700 Subject: [PATCH 5/8] Support dask 2024.8.1 Fixes tokenization errors when using dask 2024.8.1+. Starting with this version, dask enforces that tokens remain stable across pickle round-trips (dask/dask#11320). Capturing self in a lambda fails this check because instance objects can have non-deterministic pickle representations. Since calculate_variable_value_length is already a static method, replacing self with the class name is enough to remove the capture. 
--- .../dataset_builders/contents_define_vlm_dataset_builder.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index fb2374b9c..328a90e48 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: self.calculate_variable_value_length( + lambda row: ValuesDatasetBuilder.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/pyproject.toml b/pyproject.toml index bdf60450e..4e5a5664d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "dask[dataframe,array] >=2024.6.0, <2024.12.1", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From 3fb77e75c74a2ea2363704125de96bf449828ea5 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Thu, 4 Jun 2026 16:19:35 -0700 Subject: [PATCH 6/8] Support dask 2024.12.1 Fixes an import error caused by dask 2024.12.1, which removed the legacy dask.dataframe.dd submodule (dask/dask#11604). Changes the import to `import dask.dataframe as dd`, consistent with every other file in the codebase. 
--- cdisc_rules_engine/services/data_readers/csv_reader.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/services/data_readers/csv_reader.py b/cdisc_rules_engine/services/data_readers/csv_reader.py index ae8aa90d0..901e6d695 100644 --- a/cdisc_rules_engine/services/data_readers/csv_reader.py +++ b/cdisc_rules_engine/services/data_readers/csv_reader.py @@ -1,6 +1,6 @@ import tempfile -from dask.dataframe import dd +import dask.dataframe as dd from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile from cdisc_rules_engine.interfaces import DataReaderInterface diff --git a/pyproject.toml b/pyproject.toml index 4e5a5664d..75d487fb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2024.12.1", + "dask[dataframe,array] >=2024.6.0, <2025.4.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From c95bcfb8beac22cb2689e648f189e9b75db13a0c Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Thu, 4 Jun 2026 16:20:21 -0700 Subject: [PATCH 7/8] Support dask 2025.4.0 Dask 2025.4.0 optimizes multiple DataFrames together, which exposes division mismatches when assigning a pandas Series to a dask DataFrame column. The old reset_index/set_index workaround no longer avoids this. This commit replaces it with a compute-assign-rewrap via dd.from_pandas, which builds a clean expression graph. This is safe because __getitem__ already computes the DataFrame to produce the Series being assigned. 
--- cdisc_rules_engine/models/dataset/dask_dataset.py | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index f966d1d33..2609335cd 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,9 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - self._data = self._data.reset_index() - self._data = self._data.set_index("index") - self._data[key] = value + pdf = self._data.compute() + pdf[key] = value.values + self._data = dd.from_pandas(pdf, npartitions=1) elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] diff --git a/pyproject.toml b/pyproject.toml index 75d487fb6..46e20e36e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2025.4.0", + "dask[dataframe,array] >=2024.6.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From 4e9f24eea40b8c450a77f3314c8e9d8b9320eb7a Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Thu, 4 Jun 2026 16:21:28 -0700 Subject: [PATCH 8/8] Support pandas 2.2.0 Fixes a unit test to support pandas 2.2.0+. The pandas release fixes a sorting bug with https://github.com/pandas-dev/pandas/pull/54611. This commit changes the expected results accordingly. Also fixes a merge type mismatch introduced by upstream #1709: the codelist metadata side was cast to StringDtype but the evaluation dataset side was not. With pandas 2.2.0, empty columns infer as float64, and merging float64 with string is rejected. Casting both sides to string before the merge resolves this. 
--- cdisc_rules_engine/operations/codelist_extensible.py | 4 ++++ cdisc_rules_engine/operations/codelist_terms.py | 4 ++++ pyproject.toml | 2 +- .../test_dataset_metadata_define_dataset_builder.py | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/operations/codelist_extensible.py b/cdisc_rules_engine/operations/codelist_extensible.py index 62cd7b56f..cc3ab7042 100644 --- a/cdisc_rules_engine/operations/codelist_extensible.py +++ b/cdisc_rules_engine/operations/codelist_extensible.py @@ -32,6 +32,10 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) + cast_cols = {self.params.ct_version: "string"} + if self.params.codelist_code in self.evaluation_dataset.columns: + cast_cols[self.params.codelist_code] = "string" + self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: is_extensible = self.evaluation_dataset.merge( ct_df.data, diff --git a/cdisc_rules_engine/operations/codelist_terms.py b/cdisc_rules_engine/operations/codelist_terms.py index 90a0a6d18..5ddcf21ef 100644 --- a/cdisc_rules_engine/operations/codelist_terms.py +++ b/cdisc_rules_engine/operations/codelist_terms.py @@ -64,6 +64,10 @@ def _handle_multiple_versions(self) -> pd.Series: "codelist_code": "string", } ) + cast_cols = {self.params.ct_version: "string"} + if self.params.codelist_code in self.evaluation_dataset.columns: + cast_cols[self.params.codelist_code] = "string" + self.evaluation_dataset = self.evaluation_dataset.astype(cast_cols) if self.params.codelist_code in self.evaluation_dataset.columns: result = self.evaluation_dataset.merge( ct_df.data, diff --git a/pyproject.toml b/pyproject.toml index 46e20e36e..7418ff68c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "numpy >=1.26.0", "odmlib >=0.1.4", "openpyxl >=3.1.5", - "pandas >=2.1.4, <2.2.0", + "pandas >=2.1.4, <3.0.0", "psutil >=6.1.1", "pyinstaller 
>=6.11.0", "pympler >=1.1", diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index aff6c25e8..350c2e8dc 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object) + ).astype(object).sort_values("dataset_location").reset_index(drop=True) result_df = result.data[expected_df.columns].reset_index(drop=True)