From 9982a8867880f5c40b985f751401df4184bdb4e6 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:03:13 -0700 Subject: [PATCH 01/15] Define dependencies in pyproject.toml Moves dependency constraints to pyproject.toml. Makes requirements.txt a lockfile. --- .github/workflows/lint-format.yml | 2 +- .github/workflows/test-unit.yml | 4 +-- README.md | 8 ++---- pyproject.toml | 42 ++++++++++++++++++++++++++++--- requirements-dev.txt | 7 ------ requirements.txt | 2 ++ 6 files changed, 46 insertions(+), 19 deletions(-) delete mode 100644 requirements-dev.txt diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index 4c7f70fe0..f455d0b78 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 -c requirements-dev.txt + pip install black flake8 - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 38a9e55ae..d11d78b30 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements-dev.txt - pip install -e . + pip install -r requirements.txt + pip install --group dev -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/README.md b/README.md index 4340249dc..4b7806a43 100644 --- a/README.md +++ b/README.md @@ -646,11 +646,7 @@ These steps should be run before running any tests or core commands using the no - Install the requirements: - ```bash - python -m pip install -r requirements-dev.txt - ``` - - Run this from the root directory. + `pip install -e . 
&& pip install --group dev` # From the root directory ### Creating an executable version @@ -724,7 +720,7 @@ py -m twine upload --repository {repository_name} dist/* This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements-dev.txt_. +Both dependencies are added to the `dev` dependency group in _pyproject.toml_. Setting up `pre-commit` requires one extra step. After installing it you have to run: diff --git a/pyproject.toml b/pyproject.toml index d0da6d416..191c5a3aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,49 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version", "dependencies"] +dynamic = ["version"] description = "Open source offering of the cdisc rules engine" readme = "PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] +dependencies = [ + "business_rules_enhanced >=1.4.8", + "cachetools >=6.1.0", + "cdisc-library-client >=0.1.6", + "click >=8.1.7, <8.3.0", + "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "fastparquet >=2024.2.0", + "importlib-metadata >=8.5.0", + "jsonata-python >=0.6.0", + "jsonpath-ng >=1.6.1, <1.8.0", + "jsonschema >=4.18.5", + "lxml >=5.2.1", + "numpy >=1.26.0", + "odmlib >=0.1.4", + "openpyxl >=3.1.5", + "pandas >=2.1.4, <2.2.0", + "psutil >=6.1.1", + "pyinstaller >=6.11.0", + "pympler >=1.1", + "pyreadstat >=1.2.7, <1.2.9", + "python-dotenv >=1.0.0", + "pyyaml >=6.0.2", + "redis >=4.5.0", + "requests >=2.32.3", + "setuptools >=75.6.0", + "titlecase >=2.4.1", +] + +[dependency-groups] +dev = [ + "black >=24.10.0", + "flake8 >=6.1.0", + "pre-commit >=2.20.0", + "pytest >=7.4.0, <8.0.0", + "pytest-asyncio >=0.21.0", + "pytest-cov >=6.0.0", +] [project.urls] "Homepage" = 
"https://github.com/cdisc-org/cdisc-rules-engine" @@ -26,5 +63,4 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } -dependencies = {file = ["requirements.txt"]} \ No newline at end of file +version = { attr = "version.__version__" } \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index ac709f651..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -black==24.10.0 -flake8==6.1.0 -pre-commit==2.20.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4481051f6..61556482d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Lockfile: exact pinned versions for reproducible installs. +# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 From 55c516076ada93f6978d0997f516650339c24166 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:08:54 -0700 Subject: [PATCH 02/15] Support click 8.3.0 Fixes an incompatibility caused by click 8.3.0, which passes the default value as-is. 
--- core.py | 2 -- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/core.py b/core.py index ea1ff2ce6..8b162a66b 100644 --- a/core.py +++ b/core.py @@ -357,7 +357,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-s", "--standard", required=True, - default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -365,7 +364,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-v", "--version", required=True, - default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/pyproject.toml b/pyproject.toml index 191c5a3aa..6a3c2891e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "business_rules_enhanced >=1.4.8", "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", - "click >=8.1.7, <8.3.0", + "click >=8.1.7", "dask[dataframe,array] >=2024.6.0, <2024.8.1", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", From 92772d8256b2b6088b81aeea691731e7701fd8ae Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:09:40 -0700 Subject: [PATCH 03/15] Support pyreadstat 1.2.9 Fixes an incompatibility caused by pyreadstat 1.2.9, which changed original_variable_type from 'NULL' to None --- cdisc_rules_engine/services/datasetxpt_metadata_reader.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index 02c1a5ff1..dd68ccabb 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if data_type == "NULL" else data_type + "" if (data_type == "NULL" or data_type is None) else data_type for data_type in 
metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/pyproject.toml b/pyproject.toml index 6a3c2891e..336321cc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "psutil >=6.1.1", "pyinstaller >=6.11.0", "pympler >=1.1", - "pyreadstat >=1.2.7, <1.2.9", + "pyreadstat >=1.2.7", "python-dotenv >=1.0.0", "pyyaml >=6.0.2", "redis >=4.5.0", From 9e55d8a0d15147f22b8bacd7f8e162ac0061fdbe Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:11:02 -0700 Subject: [PATCH 04/15] Support jsonpath-ng 1.8.0 Works around a behavior change in jsonpath-ng 1.8.0 where Child.str gets wrapped in parentheses. --- .../services/data_services/usdm_data_service.py | 15 +++++++++++++-- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 83052f2e6..5b3cc6a99 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -417,8 +417,19 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext): - return f"{node.full_path}".replace(".[", "[") + def __get_full_path(node: DatumInContext) -> str: + parts = [] + current = node + while current is not None and current.context is not None: + parts.append(str(current.path)) + current = current.context + result = "" + for part in reversed(parts): + if part.startswith("["): + result += part + else: + result = (result + "." 
if result else "") + part + return result def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/pyproject.toml b/pyproject.toml index 336321cc3..dae7f8488 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", - "jsonpath-ng >=1.6.1, <1.8.0", + "jsonpath-ng >=1.6.1", "jsonschema >=4.18.5", "lxml >=5.2.1", "numpy >=1.26.0", From 7a81922c43f1b4c559530d7b4d3a6603a8539ed7 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:11:47 -0700 Subject: [PATCH 05/15] Support dask 2024.8.1 Fixes tokenization errors when using dask 2024.8.1+. Starting with this version, dask enforces that tokens remain stable across pickle round-trips (dask/dask#11320). Capturing self in a lambda fails this check because instance objects can have non-deterministic pickle representations. Since calculate_variable_value_length is already a static method, replacing self with the class name is enough to remove the capture. 
--- .../dataset_builders/contents_define_vlm_dataset_builder.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index fb2374b9c..328a90e48 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: self.calculate_variable_value_length( + lambda row: ValuesDatasetBuilder.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/pyproject.toml b/pyproject.toml index dae7f8488..6ac810c53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "dask[dataframe,array] >=2024.6.0, <2025.4.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From 0b5e61743e7b83ef92baa8f31133e231168536cd Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:12:31 -0700 Subject: [PATCH 06/15] Support dask 2025.4.0 Dask 2025.4.0 optimizes multiple DataFrames together, which exposes division mismatches and causes dask to throw an error. 
This change removes a source of repartitioning, preserving the divisions when assigning a pandas series to a dask dataframe --- cdisc_rules_engine/models/dataset/dask_dataset.py | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 7a6449d31..8cb84e470 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,9 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - self._data = self._data.reset_index() - self._data = self._data.set_index("index") - self._data[key] = value + chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy() + array_values = da.from_array(value.values, chunks=tuple(chunks)) + self._data[key] = array_values elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] diff --git a/pyproject.toml b/pyproject.toml index 6ac810c53..cd75e8122 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2025.4.0", + "dask[dataframe,array] >=2024.6.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From a646ccfa167e68fdb23e17569b6e86dada8c15c6 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:43:41 -0700 Subject: [PATCH 07/15] Support pandas 2.2.0 Fixes a unit test to support pandas 2.2.0+. The pandas release fixes a sorting bug with https://github.com/pandas-dev/pandas/pull/54611. This commit changes the expected results accordingly. 
--- pyproject.toml | 2 +- .../test_dataset_metadata_define_dataset_builder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cd75e8122..d2c89a6ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "numpy >=1.26.0", "odmlib >=0.1.4", "openpyxl >=3.1.5", - "pandas >=2.1.4, <2.2.0", + "pandas >=2.1.4, <3.0.0", "psutil >=6.1.1", "pyinstaller >=6.11.0", "pympler >=1.1", diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index aff6c25e8..350c2e8dc 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object) + ).astype(object).sort_values("dataset_location").reset_index(drop=True) result_df = result.data[expected_df.columns].reset_index(drop=True) From 8ac3ed5f0892a2e823fe3c90adc36cdc6e9c6eb0 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Wed, 20 May 2026 11:05:24 -0700 Subject: [PATCH 08/15] chore: drop pandas <3.0 upper bound; add pytz --- pyproject.toml | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d2c89a6ef..8c2c7566d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,9 @@ dependencies = [ "numpy >=1.26.0", "odmlib >=0.1.4", "openpyxl >=3.1.5", - "pandas >=2.1.4, <3.0.0", + "pandas >=2.1.4", "psutil >=6.1.1", + "pytz >=2020.1", "pyinstaller >=6.11.0", "pympler >=1.1", "pyreadstat >=1.2.7", diff --git a/requirements.txt b/requirements.txt index 61556482d..9ba35e062 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ pyinstaller==6.11.0 Pympler==1.1 
pyreadstat==1.2.7 python-dotenv==1.0.0 +pytz==2026.2 pyyaml==6.0.2 redis==4.5.0 requests~=2.32.3 From b09d2b7c5477b248f282014c35cdbe30ba1e9130 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Wed, 20 May 2026 11:05:36 -0700 Subject: [PATCH 09/15] fix: replace applymap() removed in pandas 3.0 --- cdisc_rules_engine/services/data_readers/dataset_json_reader.py | 2 +- .../services/data_readers/dataset_ndjson_reader.py | 2 +- cdisc_rules_engine/services/data_readers/parquet_reader.py | 2 +- cdisc_rules_engine/services/data_readers/xpt_reader.py | 2 +- cdisc_rules_engine/services/data_services/dummy_data_service.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py index 71e312528..d95824a3c 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py @@ -36,7 +36,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: [item for item in datasetjson.get("rows", [])], columns=[item["name"] for item in datasetjson.get("columns", [])], ) - return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return df.map(lambda x: round(x, 15) if isinstance(x, float) else x) def from_file(self, file_path): try: diff --git a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py index 48b998e40..c2652343e 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py @@ -45,7 +45,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: [item for item in datandjson], columns=[item["name"] for item in metadatandjson.get("columns", [])], ) - return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return df.map(lambda x: round(x, 15) if 
isinstance(x, float) else x) def from_file(self, file_path): try: diff --git a/cdisc_rules_engine/services/data_readers/parquet_reader.py b/cdisc_rules_engine/services/data_readers/parquet_reader.py index 6e7867b63..1df298969 100644 --- a/cdisc_rules_engine/services/data_readers/parquet_reader.py +++ b/cdisc_rules_engine/services/data_readers/parquet_reader.py @@ -32,7 +32,7 @@ def from_file(self, file_path): def _format_floats( self, dataframe: Union[pd.DataFrame, dd.DataFrame] ) -> Union[pd.DataFrame, dd.DataFrame]: - return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x) def _read_dask(self, file_path): data = dd.read_parquet(file_path) diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index d20e1e85d..f2068bedc 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -50,4 +50,4 @@ def from_file(self, file_path): return self._read_pandas(file_path) def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame: - return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x) diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index a3a98b978..2f846217d 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -63,7 +63,7 @@ def get_dataset(self, dataset_name: str, **params) -> PandasDataset: dataset: Optional[DummyDataset] = self.get_dataset_data(dataset_name) if dataset is not None: df: pd.DataFrame = dataset.data - df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x) + df = df.map(lambda x: x.decode("utf-8") 
if isinstance(x, bytes) else x) result = PandasDataset(df) return result else: From 6a330bf28f1023f3c003f54a974710e0bd632371 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Wed, 20 May 2026 11:06:01 -0700 Subject: [PATCH 10/15] fix: replace inplace=True mutation (pandas 3.0 CoW) --- cdisc_rules_engine/check_operators/dataframe_operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 78e4f5a9f..82af82743 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1499,7 +1499,7 @@ def check_inconsistency(row): return df.apply(check_inconsistency, axis=1) def next_column_exists_and_previous_is_null(self, row) -> bool: - row.reset_index(drop=True, inplace=True) + row = row.reset_index(drop=True) for index in row[ row.isin(NULL_FLAVORS) | pd.isna(row) ].index: # leaving null values only From 4badf8a15f931854ebd7520fd53e11b71add6682 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Wed, 20 May 2026 11:06:36 -0700 Subject: [PATCH 11/15] fix: handle pandas 3.0 default StringDtype --- .../check_operators/dataframe_operators.py | 50 +++++++++++-------- cdisc_rules_engine/check_operators/helpers.py | 2 +- cdisc_rules_engine/operations/record_count.py | 2 +- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 82af82743..1db51c152 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -228,8 +228,8 @@ def _check_equality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = 
comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val == comparison_val return target_val == comparison_val @@ -275,8 +275,8 @@ def _check_inequality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val != comparison_val return target_val != comparison_val @@ -698,6 +698,12 @@ def is_contained_by_case_insensitive(self, other_value): def is_not_contained_by_case_insensitive(self, other_value): return ~self.is_contained_by_case_insensitive(other_value) + @staticmethod + def _map_regex(series, func): + # pandas 3 returns nullable BooleanDtype from .map(); normalize to numpy + # bool so ~ and & behave identically for both positive and negated callers. 
+ return series.map(func, na_action="ignore").fillna(False).astype(bool) + @log_operator_execution @type_operator(FIELD_DATAFRAME) def prefix_matches_regex(self, other_value): @@ -707,10 +713,10 @@ def prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -721,10 +727,10 @@ def not_prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -735,10 +741,10 @@ def suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -749,10 +755,10 @@ def not_suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + 
return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -762,10 +768,10 @@ def matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -775,10 +781,10 @@ def not_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 4a9348b8e..caaae4089 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -56,7 +56,7 @@ def default_value(self): def is_valid_date(date_string: str) -> bool: - if date_string is None: + if not isinstance(date_string, str): return False try: isoparse(date_string) diff --git a/cdisc_rules_engine/operations/record_count.py b/cdisc_rules_engine/operations/record_count.py index 017b260f1..aa4579d08 100644 --- a/cdisc_rules_engine/operations/record_count.py +++ b/cdisc_rules_engine/operations/record_count.py @@ -169,7 +169,7 @@ def _build_effective_grouping(self) -> tuple[list, dict]: if self.params.dataframe[col].isna().all(): all_na_cols[col] = None elif ( - self.params.dataframe[col].dtype 
== "object" + pd.api.types.is_string_dtype(self.params.dataframe[col]) and self.params.dataframe[col].fillna("").str.strip().eq("").all() ): all_na_cols[col] = "" From 8a9ba0b1ad8fe3cbf8d98fab864704ac0fa277d7 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Fri, 22 May 2026 16:35:05 -0700 Subject: [PATCH 12/15] fix: handle extension arrays in DaskDataset.__setitem__ --- cdisc_rules_engine/models/dataset/dask_dataset.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 8cb84e470..101aa1698 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,17 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy() - array_values = da.from_array(value.values, chunks=tuple(chunks)) - self._data[key] = array_values + if not isinstance(value.values, np.ndarray): + # Extension array (e.g. StringDtype): da.from_array() cannot handle it; + # materialize to pandas, assign positionally, and re-partition. 
+ npartitions = self._data.npartitions + pandas_df = self._data.compute() + pandas_df[key] = value.values + self._data = dd.from_pandas(pandas_df, npartitions=npartitions) + else: + chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy() + array_values = da.from_array(value.values, chunks=tuple(chunks)) + self._data[key] = array_values elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] From b71379102f555333fce598bf8aeab73d3dc2cfe6 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Fri, 22 May 2026 16:38:39 -0700 Subject: [PATCH 13/15] fix: remove unsupported dd.DataFrame type in parquet_reader --- cdisc_rules_engine/services/data_readers/parquet_reader.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cdisc_rules_engine/services/data_readers/parquet_reader.py b/cdisc_rules_engine/services/data_readers/parquet_reader.py index 1df298969..953c246d4 100644 --- a/cdisc_rules_engine/services/data_readers/parquet_reader.py +++ b/cdisc_rules_engine/services/data_readers/parquet_reader.py @@ -1,5 +1,4 @@ from io import BytesIO -from typing import Union import pandas as pd import dask.dataframe as dd @@ -29,9 +28,7 @@ def from_file(self, file_path): file_path ) - def _format_floats( - self, dataframe: Union[pd.DataFrame, dd.DataFrame] - ) -> Union[pd.DataFrame, dd.DataFrame]: + def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x) def _read_dask(self, file_path): From cf90ca5b0dfc4e6f7350759924fe5b1268278011 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Fri, 22 May 2026 16:39:14 -0700 Subject: [PATCH 14/15] fix: remove method= and downcast= removed in pandas 3.0 --- cdisc_rules_engine/models/dataset/dask_dataset.py | 4 +--- cdisc_rules_engine/models/dataset/dataset_interface.py | 2 -- cdisc_rules_engine/models/dataset/pandas_dataset.py | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff 
--git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 101aa1698..08a8649cc 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -356,16 +356,14 @@ def iloc(self, n=None, column=None): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. """ - result = self._data.fillna(value=value, method=method, axis=axis, limit=limit) + result = self._data.fillna(value=value, axis=axis, limit=limit) if inplace: self._data = result return None diff --git a/cdisc_rules_engine/models/dataset/dataset_interface.py b/cdisc_rules_engine/models/dataset/dataset_interface.py index 5854cef82..99a69450d 100644 --- a/cdisc_rules_engine/models/dataset/dataset_interface.py +++ b/cdisc_rules_engine/models/dataset/dataset_interface.py @@ -229,11 +229,9 @@ def reset_index(self, drop=False, **kwargs): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. diff --git a/cdisc_rules_engine/models/dataset/pandas_dataset.py b/cdisc_rules_engine/models/dataset/pandas_dataset.py index 497120e3e..435c923ed 100644 --- a/cdisc_rules_engine/models/dataset/pandas_dataset.py +++ b/cdisc_rules_engine/models/dataset/pandas_dataset.py @@ -260,22 +260,18 @@ def reset_index(self, drop=False, **kwargs): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. 
""" result = self._data.fillna( value=value, - method=method, axis=axis, inplace=inplace, limit=limit, - downcast=downcast, ) if inplace: return None From 4437044e94bd0ee6c08abd3618f25c76bb1e15af Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Fri, 22 May 2026 16:37:53 -0700 Subject: [PATCH 15/15] fix: replace Dask GroupBy path in Distinct (.apply(set) fails in pandas 3.0) --- cdisc_rules_engine/operations/distinct.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index a7485a4f6..b18b135dc 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -60,12 +60,20 @@ def get_existing_column_names(group): self._unique_values_for_column ) else: + # Dask path: groupby-apply produces only the aggregated result, + # matching the shape expected by _handle_grouped_result. + grouping = self.params.grouping + target = self.params.target result = ( - grouped.data[self.params.target] - .unique() - .rename({self.params.target: self.params.operation_id}) + result.data.groupby(grouping)[target] + .apply( + lambda col: set(col.dropna().unique()), + meta=pd.Series(dtype=object), + ) + .compute() + .to_frame(name=target) + .reset_index() ) - result = result.apply(set).to_frame().reset_index() return result def _get_referenced_datasets(self):