diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index 4c7f70fe0..f455d0b78 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 -c requirements-dev.txt + pip install black flake8 - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 38a9e55ae..d11d78b30 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements-dev.txt - pip install -e . + pip install -r requirements.txt + pip install --group dev -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/README.md b/README.md index 4340249dc..4b7806a43 100644 --- a/README.md +++ b/README.md @@ -646,11 +646,7 @@ These steps should be run before running any tests or core commands using the no - Install the requirements: - ```bash - python -m pip install -r requirements-dev.txt - ``` - - Run this from the root directory. + `pip install -e . && pip install --group dev` # From the root directory ### Creating an executable version @@ -724,7 +720,7 @@ py -m twine upload --repository {repository_name} dist/* This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements-dev.txt_. +Both dependencies are added to the `dev` dependency group in _pyproject.toml_. Setting up `pre-commit` requires one extra step. After installing it you have to run: diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 78e4f5a9f..1db51c152 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -228,8 +228,8 @@ def _check_equality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val == comparison_val return target_val == comparison_val @@ -275,8 +275,8 @@ def _check_inequality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val != comparison_val return target_val != comparison_val @@ -698,6 +698,12 @@ def is_contained_by_case_insensitive(self, other_value): def is_not_contained_by_case_insensitive(self, other_value): return ~self.is_contained_by_case_insensitive(other_value) + @staticmethod + def _map_regex(series, func): + # pandas 3 returns nullable BooleanDtype from .map(); normalize to numpy + # bool so ~ and & behave identically for both positive and negated callers. + return series.map(func, na_action="ignore").fillna(False).astype(bool) + @log_operator_execution @type_operator(FIELD_DATAFRAME) def prefix_matches_regex(self, other_value): @@ -707,10 +713,10 @@ def prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -721,10 +727,10 @@ def not_prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -735,10 +741,10 @@ def suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -749,10 +755,10 @@ def not_suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -762,10 +768,10 @@ def matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -775,10 +781,10 @@ def not_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -1499,7 +1505,7 @@ def check_inconsistency(row): return df.apply(check_inconsistency, axis=1) def next_column_exists_and_previous_is_null(self, row) -> bool: - row.reset_index(drop=True, inplace=True) + row = row.reset_index(drop=True) for index in row[ row.isin(NULL_FLAVORS) | pd.isna(row) ].index: # leaving null values only diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 4a9348b8e..caaae4089 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -56,7 +56,7 @@ def default_value(self): def is_valid_date(date_string: str) -> bool: - if date_string is None: + if not isinstance(date_string, str): return False try: isoparse(date_string) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index fb2374b9c..328a90e48 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: self.calculate_variable_value_length( + lambda row: ValuesDatasetBuilder.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 7a6449d31..08a8649cc 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,17 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - self._data = self._data.reset_index() - self._data = self._data.set_index("index") - self._data[key] = value + if not isinstance(value.values, np.ndarray): + # Extension array (e.g. StringDtype): da.from_array() cannot handle it; + # materialize to pandas, assign positionally, and re-partition. + npartitions = self._data.npartitions + pandas_df = self._data.compute() + pandas_df[key] = value.values + self._data = dd.from_pandas(pandas_df, npartitions=npartitions) + else: + chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy() + array_values = da.from_array(value.values, chunks=tuple(chunks)) + self._data[key] = array_values elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] @@ -348,16 +356,14 @@ def iloc(self, n=None, column=None): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. """ - result = self._data.fillna(value=value, method=method, axis=axis, limit=limit) + result = self._data.fillna(value=value, axis=axis, limit=limit) if inplace: self._data = result return None diff --git a/cdisc_rules_engine/models/dataset/dataset_interface.py b/cdisc_rules_engine/models/dataset/dataset_interface.py index 5854cef82..99a69450d 100644 --- a/cdisc_rules_engine/models/dataset/dataset_interface.py +++ b/cdisc_rules_engine/models/dataset/dataset_interface.py @@ -229,11 +229,9 @@ def reset_index(self, drop=False, **kwargs): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. diff --git a/cdisc_rules_engine/models/dataset/pandas_dataset.py b/cdisc_rules_engine/models/dataset/pandas_dataset.py index 497120e3e..435c923ed 100644 --- a/cdisc_rules_engine/models/dataset/pandas_dataset.py +++ b/cdisc_rules_engine/models/dataset/pandas_dataset.py @@ -260,22 +260,18 @@ def reset_index(self, drop=False, **kwargs): def fillna( self, value=None, - method=None, axis=None, inplace=False, limit=None, - downcast=None, ): """ Fill NA/NaN values using the specified method. """ result = self._data.fillna( value=value, - method=method, axis=axis, inplace=inplace, limit=limit, - downcast=downcast, ) if inplace: return None diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index a7485a4f6..b18b135dc 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -60,12 +60,20 @@ def get_existing_column_names(group): self._unique_values_for_column ) else: + # Dask path: groupby-apply produces only the aggregated result, + # matching the shape expected by _handle_grouped_result. + grouping = self.params.grouping + target = self.params.target result = ( - grouped.data[self.params.target] - .unique() - .rename({self.params.target: self.params.operation_id}) + result.data.groupby(grouping)[target] + .apply( + lambda col: set(col.dropna().unique()), + meta=pd.Series(dtype=object), + ) + .compute() + .to_frame(name=target) + .reset_index() ) - result = result.apply(set).to_frame().reset_index() return result def _get_referenced_datasets(self): diff --git a/cdisc_rules_engine/operations/record_count.py b/cdisc_rules_engine/operations/record_count.py index 017b260f1..aa4579d08 100644 --- a/cdisc_rules_engine/operations/record_count.py +++ b/cdisc_rules_engine/operations/record_count.py @@ -169,7 +169,7 @@ def _build_effective_grouping(self) -> tuple[list, dict]: if self.params.dataframe[col].isna().all(): all_na_cols[col] = None elif ( - self.params.dataframe[col].dtype == "object" + pd.api.types.is_string_dtype(self.params.dataframe[col]) and self.params.dataframe[col].fillna("").str.strip().eq("").all() ): all_na_cols[col] = "" diff --git a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py index 71e312528..d95824a3c 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py @@ -36,7 +36,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: [item for item in datasetjson.get("rows", [])], columns=[item["name"] for item in datasetjson.get("columns", [])], ) - return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return df.map(lambda x: round(x, 15) if isinstance(x, float) else x) def from_file(self, file_path): try: diff --git a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py index 48b998e40..c2652343e 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py @@ -45,7 +45,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: [item for item in datandjson], columns=[item["name"] for item in metadatandjson.get("columns", [])], ) - return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return df.map(lambda x: round(x, 15) if isinstance(x, float) else x) def from_file(self, file_path): try: diff --git a/cdisc_rules_engine/services/data_readers/parquet_reader.py b/cdisc_rules_engine/services/data_readers/parquet_reader.py index 6e7867b63..953c246d4 100644 --- a/cdisc_rules_engine/services/data_readers/parquet_reader.py +++ b/cdisc_rules_engine/services/data_readers/parquet_reader.py @@ -1,5 +1,4 @@ from io import BytesIO -from typing import Union import pandas as pd import dask.dataframe as dd @@ -29,10 +28,8 @@ def from_file(self, file_path): file_path ) - def _format_floats( - self, dataframe: Union[pd.DataFrame, dd.DataFrame] - ) -> Union[pd.DataFrame, dd.DataFrame]: - return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame: + return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x) def _read_dask(self, file_path): data = dd.read_parquet(file_path) diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index d20e1e85d..f2068bedc 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -50,4 +50,4 @@ def from_file(self, file_path): return self._read_pandas(file_path) def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame: - return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x) + return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x) diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index a3a98b978..2f846217d 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -63,7 +63,7 @@ def get_dataset(self, dataset_name: str, **params) -> PandasDataset: dataset: Optional[DummyDataset] = self.get_dataset_data(dataset_name) if dataset is not None: df: pd.DataFrame = dataset.data - df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x) + df = df.map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x) result = PandasDataset(df) return result else: diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 83052f2e6..5b3cc6a99 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -417,8 +417,19 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext): - return f"{node.full_path}".replace(".[", "[") + def __get_full_path(node: DatumInContext) -> str: + parts = [] + current = node + while current is not None and current.context is not None: + parts.append(str(current.path)) + current = current.context + result = "" + for part in reversed(parts): + if part.startswith("["): + result += part + else: + result = (result + "." if result else "") + part + return result def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index 02c1a5ff1..dd68ccabb 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if data_type == "NULL" else data_type + "" if (data_type == "NULL" or data_type is None) else data_type for data_type in metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/core.py b/core.py index ea1ff2ce6..8b162a66b 100644 --- a/core.py +++ b/core.py @@ -357,7 +357,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-s", "--standard", required=True, - default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -365,7 +364,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-v", "--version", required=True, - default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/pyproject.toml b/pyproject.toml index d0da6d416..8c2c7566d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,50 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version", "dependencies"] +dynamic = ["version"] description = "Open source offering of the cdisc rules engine" readme = "PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] +dependencies = [ + "business_rules_enhanced >=1.4.8", + "cachetools >=6.1.0", + "cdisc-library-client >=0.1.6", + "click >=8.1.7", + "dask[dataframe,array] >=2024.6.0", + "fastparquet >=2024.2.0", + "importlib-metadata >=8.5.0", + "jsonata-python >=0.6.0", + "jsonpath-ng >=1.6.1", + "jsonschema >=4.18.5", + "lxml >=5.2.1", + "numpy >=1.26.0", + "odmlib >=0.1.4", + "openpyxl >=3.1.5", + "pandas >=2.1.4", + "psutil >=6.1.1", + "pytz >=2020.1", + "pyinstaller >=6.11.0", + "pympler >=1.1", + "pyreadstat >=1.2.7", + "python-dotenv >=1.0.0", + "pyyaml >=6.0.2", + "redis >=4.5.0", + "requests >=2.32.3", + "setuptools >=75.6.0", + "titlecase >=2.4.1", +] + +[dependency-groups] +dev = [ + "black >=24.10.0", + "flake8 >=6.1.0", + "pre-commit >=2.20.0", + "pytest >=7.4.0, <8.0.0", + "pytest-asyncio >=0.21.0", + "pytest-cov >=6.0.0", +] [project.urls] "Homepage" = "https://github.com/cdisc-org/cdisc-rules-engine" @@ -26,5 +64,4 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } -dependencies = {file = ["requirements.txt"]} \ No newline at end of file +version = { attr = "version.__version__" } \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index ac709f651..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -black==24.10.0 -flake8==6.1.0 -pre-commit==2.20.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4481051f6..9ba35e062 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Lockfile: exact pinned versions for reproducible installs. +# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 @@ -19,6 +21,7 @@ pyinstaller==6.11.0 Pympler==1.1 pyreadstat==1.2.7 python-dotenv==1.0.0 +pytz==2026.2 pyyaml==6.0.2 redis==4.5.0 requests~=2.32.3 diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index aff6c25e8..350c2e8dc 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object) + ).astype(object).sort_values("dataset_location").reset_index(drop=True) result_df = result.data[expected_df.columns].reset_index(drop=True)