From b1eaee894035d1e3eaab154cf9f5183bb55409c1 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Wed, 20 May 2026 11:06:36 -0700 Subject: [PATCH] fix: handle pandas 3.0 default StringDtype --- .../check_operators/dataframe_operators.py | 50 +++++++++++-------- cdisc_rules_engine/check_operators/helpers.py | 2 +- cdisc_rules_engine/operations/record_count.py | 2 +- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 7f0c94e43..417289858 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -228,8 +228,8 @@ def _check_equality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val == comparison_val return target_val == comparison_val @@ -275,8 +275,8 @@ def _check_inequality( target_val = custom_str_conversion(target_val) comparison_val = custom_str_conversion(comparison_val) if case_insensitive: - target_val = target_val.lower() if target_val else None - comparison_val = comparison_val.lower() if comparison_val else None + target_val = target_val.lower() if isinstance(target_val, str) and target_val else None + comparison_val = comparison_val.lower() if isinstance(comparison_val, str) and comparison_val else None return target_val != comparison_val return target_val != comparison_val @@ -696,6 +696,12 @@ def is_contained_by_case_insensitive(self, other_value): def is_not_contained_by_case_insensitive(self, other_value): return 
~self.is_contained_by_case_insensitive(other_value) + @staticmethod + def _map_regex(series, func): + # pandas 3 returns nullable BooleanDtype from .map(); normalize to numpy + # bool so ~ and & behave identically for both positive and negated callers. + return series.map(func, na_action="ignore").fillna(False).astype(bool) + @log_operator_execution @type_operator(FIELD_DATAFRAME) def prefix_matches_regex(self, other_value): @@ -705,10 +711,10 @@ def prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -719,10 +725,10 @@ def not_prefix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[:prefix]) is not None + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[:prefix]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -733,10 +739,10 @@ def suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -747,10 +753,10 @@ def 
not_suffix_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).map( - lambda x: re.search(comparator, x[-suffix:]) is not None + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.search(comparator, x[-suffix:]) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -760,10 +766,10 @@ def matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -773,10 +779,10 @@ def not_matches_regex(self, other_value): converted_strings = self.value[target].map( lambda x: self._regex_str_conversion(x) ) - results = converted_strings.notna() & ~converted_strings.astype(str).str.match( - comparator + return converted_strings.notna() & ~self._map_regex( + converted_strings.astype(str), + lambda x: re.match(comparator, x) is not None, ) - return results @log_operator_execution @type_operator(FIELD_DATAFRAME) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index ca3e9f888..0816380b0 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -56,7 +56,7 @@ def default_value(self): def is_valid_date(date_string: str) -> bool: - if date_string is None or not isinstance(date_string, str): + if not isinstance(date_string, str): return False try: isoparse(date_string) diff --git a/cdisc_rules_engine/operations/record_count.py b/cdisc_rules_engine/operations/record_count.py index 
017b260f1..aa4579d08 100644 --- a/cdisc_rules_engine/operations/record_count.py +++ b/cdisc_rules_engine/operations/record_count.py @@ -169,7 +169,7 @@ def _build_effective_grouping(self) -> tuple[list, dict]: if self.params.dataframe[col].isna().all(): all_na_cols[col] = None elif ( - self.params.dataframe[col].dtype == "object" + pd.api.types.is_string_dtype(self.params.dataframe[col].dtype) and self.params.dataframe[col].fillna("").str.strip().eq("").all() ): all_na_cols[col] = ""