Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint-format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
python-version: "3.12"
- name: Install linters
run: |
pip install black flake8 -c requirements-dev.txt
pip install black flake8
- name: Run flake8
run: |
flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test-unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ jobs:
python-version: "3.12"
- name: Install requirements
run: |
pip install -r requirements-dev.txt
pip install -e .
pip install -r requirements.txt
pip install --group dev -e .
- name: Running Tests
env:
CDISC_LIBRARY_API_KEY: fakekey12341234
Expand Down
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -646,11 +646,7 @@ These steps should be run before running any tests or core commands using the no

- Install the requirements:

```bash
python -m pip install -r requirements-dev.txt
```

Run this from the root directory.
`pip install -e . && pip install --group dev` # From the root directory

### Creating an executable version

Expand Down Expand Up @@ -724,7 +720,7 @@ py -m twine upload --repository {repository_name} dist/*

This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD.
It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit.
Both dependencies are added to _requirements-dev.txt_.
Both dependencies are added to the `dev` dependency group in _pyproject.toml_.

Setting up `pre-commit` requires one extra step. After installing it you have to run:

Expand Down
52 changes: 29 additions & 23 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ def _check_equality(
target_val = custom_str_conversion(target_val)
comparison_val = custom_str_conversion(comparison_val)
if case_insensitive:
target_val = target_val.lower() if target_val else None
comparison_val = comparison_val.lower() if comparison_val else None
target_val = target_val.lower() if isinstance(target_val, str) and target_val else None
comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None
return target_val == comparison_val
return target_val == comparison_val

Expand Down Expand Up @@ -275,8 +275,8 @@ def _check_inequality(
target_val = custom_str_conversion(target_val)
comparison_val = custom_str_conversion(comparison_val)
if case_insensitive:
target_val = target_val.lower() if target_val else None
comparison_val = comparison_val.lower() if comparison_val else None
target_val = target_val.lower() if isinstance(target_val, str) and target_val else None
comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None
return target_val != comparison_val
return target_val != comparison_val

Expand Down Expand Up @@ -698,6 +698,12 @@ def is_contained_by_case_insensitive(self, other_value):
def is_not_contained_by_case_insensitive(self, other_value):
return ~self.is_contained_by_case_insensitive(other_value)

@staticmethod
def _map_regex(series, func):
    """Apply a predicate element-wise and return a plain ``bool`` Series.

    Args:
        series: pandas Series of values to test. Missing entries are not
            passed to ``func``.
        func: callable taking one element and returning a truthy/falsy
            result (the regex operators pass ``re.search``/``re.match``
            checks).

    Returns:
        A numpy-``bool`` Series aligned with ``series``; positions whose
        input was missing are ``False``.
    """
    # na_action="ignore" propagates missing inputs as NA instead of calling
    # func on them; bool() pins every computed result to a real boolean.
    mapped = series.map(lambda value: bool(func(value)), na_action="ignore")
    # NOTE(review): the original comment states pandas 3 may hand back a
    # nullable BooleanDtype here — not verifiable from this file alone.
    # Normalising through nullable boolean first keeps the fill on an
    # extension array, so pandas 2.2 does not emit its object-dtype
    # downcasting FutureWarning; the final cast yields numpy bool so that
    # ~ and & behave the same for positive and negated callers.
    return mapped.astype("boolean").fillna(False).astype(bool)

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def prefix_matches_regex(self, other_value):
Expand All @@ -707,10 +713,10 @@ def prefix_matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & converted_strings.astype(str).map(
lambda x: re.search(comparator, x[:prefix]) is not None
return converted_strings.notna() & self._map_regex(
converted_strings.astype(str),
lambda x: re.search(comparator, x[:prefix]) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand All @@ -721,10 +727,10 @@ def not_prefix_matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & ~converted_strings.astype(str).map(
lambda x: re.search(comparator, x[:prefix]) is not None
return converted_strings.notna() & ~self._map_regex(
converted_strings.astype(str),
lambda x: re.search(comparator, x[:prefix]) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand All @@ -735,10 +741,10 @@ def suffix_matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & converted_strings.astype(str).map(
lambda x: re.search(comparator, x[-suffix:]) is not None
return converted_strings.notna() & self._map_regex(
converted_strings.astype(str),
lambda x: re.search(comparator, x[-suffix:]) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand All @@ -749,10 +755,10 @@ def not_suffix_matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & ~converted_strings.astype(str).map(
lambda x: re.search(comparator, x[-suffix:]) is not None
return converted_strings.notna() & ~self._map_regex(
converted_strings.astype(str),
lambda x: re.search(comparator, x[-suffix:]) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand All @@ -762,10 +768,10 @@ def matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & converted_strings.astype(str).str.match(
comparator
return converted_strings.notna() & self._map_regex(
converted_strings.astype(str),
lambda x: re.match(comparator, x) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand All @@ -775,10 +781,10 @@ def not_matches_regex(self, other_value):
converted_strings = self.value[target].map(
lambda x: self._regex_str_conversion(x)
)
results = converted_strings.notna() & ~converted_strings.astype(str).str.match(
comparator
return converted_strings.notna() & ~self._map_regex(
converted_strings.astype(str),
lambda x: re.match(comparator, x) is not None,
)
return results

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
Expand Down Expand Up @@ -1499,7 +1505,7 @@ def check_inconsistency(row):
return df.apply(check_inconsistency, axis=1)

def next_column_exists_and_previous_is_null(self, row) -> bool:
row.reset_index(drop=True, inplace=True)
row = row.reset_index(drop=True)
for index in row[
row.isin(NULL_FLAVORS) | pd.isna(row)
].index: # leaving null values only
Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/check_operators/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def default_value(self):


def is_valid_date(date_string: str) -> bool:
if date_string is None:
if not isinstance(date_string, str):
return False
try:
isoparse(date_string)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def build(self):
data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[
["variable_value", "define_vlm_data_type"]
].apply(
lambda row: self.calculate_variable_value_length(
lambda row: ValuesDatasetBuilder.calculate_variable_value_length(
row["variable_value"], row["define_vlm_data_type"]
),
axis=1,
Expand Down
18 changes: 12 additions & 6 deletions cdisc_rules_engine/models/dataset/dask_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,17 @@ def __setitem__(self, key, value):
array_values = da.from_array(value, chunks=tuple(chunks))
self._data[key] = array_values
elif isinstance(value, pd.Series):
self._data = self._data.reset_index()
self._data = self._data.set_index("index")
self._data[key] = value
if not isinstance(value.values, np.ndarray):
# Extension array (e.g. StringDtype): da.from_array() cannot handle it;
# materialize to pandas, assign positionally, and re-partition.
npartitions = self._data.npartitions
pandas_df = self._data.compute()
pandas_df[key] = value.values
self._data = dd.from_pandas(pandas_df, npartitions=npartitions)
else:
chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy()
array_values = da.from_array(value.values, chunks=tuple(chunks))
self._data[key] = array_values
elif isinstance(value, dd.DataFrame):
for column in value:
self._data[column] = value[column]
Expand Down Expand Up @@ -348,16 +356,14 @@ def iloc(self, n=None, column=None):
def fillna(
self,
value=None,
method=None,
axis=None,
inplace=False,
limit=None,
downcast=None,
):
"""
Fill NA/NaN values using the specified method.
"""
result = self._data.fillna(value=value, method=method, axis=axis, limit=limit)
result = self._data.fillna(value=value, axis=axis, limit=limit)
if inplace:
self._data = result
return None
Expand Down
2 changes: 0 additions & 2 deletions cdisc_rules_engine/models/dataset/dataset_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,9 @@ def reset_index(self, drop=False, **kwargs):
def fillna(
self,
value=None,
method=None,
axis=None,
inplace=False,
limit=None,
downcast=None,
):
"""
Fill NA/NaN values using the specified method.
Expand Down
4 changes: 0 additions & 4 deletions cdisc_rules_engine/models/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,22 +260,18 @@ def reset_index(self, drop=False, **kwargs):
def fillna(
self,
value=None,
method=None,
axis=None,
inplace=False,
limit=None,
downcast=None,
):
"""
Fill NA/NaN values using the specified method.
"""
result = self._data.fillna(
value=value,
method=method,
axis=axis,
inplace=inplace,
limit=limit,
downcast=downcast,
)
if inplace:
return None
Expand Down
16 changes: 12 additions & 4 deletions cdisc_rules_engine/operations/distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,20 @@ def get_existing_column_names(group):
self._unique_values_for_column
)
else:
# Dask path: groupby-apply produces only the aggregated result,
# matching the shape expected by _handle_grouped_result.
grouping = self.params.grouping
target = self.params.target
result = (
grouped.data[self.params.target]
.unique()
.rename({self.params.target: self.params.operation_id})
result.data.groupby(grouping)[target]
.apply(
lambda col: set(col.dropna().unique()),
meta=pd.Series(dtype=object),
)
.compute()
.to_frame(name=target)
.reset_index()
)
result = result.apply(set).to_frame().reset_index()
return result

def _get_referenced_datasets(self):
Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/operations/record_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def _build_effective_grouping(self) -> tuple[list, dict]:
if self.params.dataframe[col].isna().all():
all_na_cols[col] = None
elif (
self.params.dataframe[col].dtype == "object"
pd.api.types.is_string_dtype(self.params.dataframe[col])
and self.params.dataframe[col].fillna("").str.strip().eq("").all()
):
all_na_cols[col] = ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame:
[item for item in datasetjson.get("rows", [])],
columns=[item["name"] for item in datasetjson.get("columns", [])],
)
return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x)
return df.map(lambda x: round(x, 15) if isinstance(x, float) else x)

def from_file(self, file_path):
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _raw_dataset_from_file(self, file_path) -> pd.DataFrame:
[item for item in datandjson],
columns=[item["name"] for item in metadatandjson.get("columns", [])],
)
return df.applymap(lambda x: round(x, 15) if isinstance(x, float) else x)
return df.map(lambda x: round(x, 15) if isinstance(x, float) else x)

def from_file(self, file_path):
try:
Expand Down
7 changes: 2 additions & 5 deletions cdisc_rules_engine/services/data_readers/parquet_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from io import BytesIO
from typing import Union

import pandas as pd
import dask.dataframe as dd
Expand Down Expand Up @@ -29,10 +28,8 @@ def from_file(self, file_path):
file_path
)

def _format_floats(
self, dataframe: Union[pd.DataFrame, dd.DataFrame]
) -> Union[pd.DataFrame, dd.DataFrame]:
return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x)
def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame:
return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x)

def _read_dask(self, file_path):
data = dd.read_parquet(file_path)
Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/services/data_readers/xpt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ def from_file(self, file_path):
return self._read_pandas(file_path)

def _format_floats(self, dataframe: pd.DataFrame) -> pd.DataFrame:
return dataframe.applymap(lambda x: round(x, 15) if isinstance(x, float) else x)
return dataframe.map(lambda x: round(x, 15) if isinstance(x, float) else x)
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def get_dataset(self, dataset_name: str, **params) -> PandasDataset:
dataset: Optional[DummyDataset] = self.get_dataset_data(dataset_name)
if dataset is not None:
df: pd.DataFrame = dataset.data
df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
df = df.map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
result = PandasDataset(df)
return result
else:
Expand Down
15 changes: 13 additions & 2 deletions cdisc_rules_engine/services/data_services/usdm_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,19 @@ def __read_node_metadata(
}

@staticmethod
def __get_full_path(node: DatumInContext):
return f"{node.full_path}".replace(".[", "[")
def __get_full_path(node: DatumInContext) -> str:
    """Build the dotted path of ``node`` by walking its ``context`` chain.

    The walk collects ``str(path)`` for the node and each ancestor that
    itself has a context, so the top-most datum (whose context is ``None``)
    contributes nothing. Segments starting with ``[`` are appended directly
    to the preceding text; all others are joined with ``.``.
    """
    segments = []
    cursor = node
    while cursor is not None and cursor.context is not None:
        segments.append(str(cursor.path))
        cursor = cursor.context

    pieces = []
    has_text = False  # mirrors the truthiness of the path built so far
    for segment in reversed(segments):
        if has_text and not segment.startswith("["):
            pieces.append(".")
        pieces.append(segment)
        has_text = has_text or bool(segment)
    return "".join(pieces)

def __get_datasets_content_index(self) -> List[dict]:
"""
Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/services/datasetxpt_metadata_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def read(self) -> dict:
"variable_labels": list(metadata.column_labels),
"variable_names": list(metadata.column_names),
"variable_formats": [
"" if data_type == "NULL" else data_type
"" if (data_type == "NULL" or data_type is None) else data_type
for data_type in metadata.original_variable_types.values()
],
"variable_name_to_label_map": metadata.column_names_to_labels,
Expand Down
2 changes: 0 additions & 2 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,15 +357,13 @@ def load_custom_dotenv_from_data_options(ctx, param, value):
"-s",
"--standard",
required=True,
default=None,
help="CDISC standard to validate against",
envvar="PRODUCT",
)
@click.option(
"-v",
"--version",
required=True,
default=None,
help="Standard version to validate against",
envvar="VERSION",
)
Expand Down
Loading