Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 63 additions & 12 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,12 +1664,12 @@ def _verify_neighbor_consistency(

return is_valid

def check_target_ascending_in_sorted_group(
def check_target_ascending_in_sorted_group_with_regex(
self, group, target, comparator, ascending, na_pos
):
"""
Check if target values are in ascending order within a group
already sorted by comparator.
already sorted by comparator. Supports regex extraction.
"""
is_valid = pd.Series(True, index=group.index)
is_numeric_comparator = pd.api.types.is_numeric_dtype(group[comparator])
Expand Down Expand Up @@ -1798,12 +1798,45 @@ def _process_grouped_result(
grouped_result = pd.Series(result_list, index=index_list)
return grouped_result.reindex(sorted_df.index, fill_value=True)

def _extract_regex_group(self, series: pd.Series, regex_pattern: str) -> pd.Series:
"""
Extract the first capturing group from a regex pattern and convert to numeric if possible.
Handles zero-padded numbers by converting to numeric.

Args:
series: Pandas series with string values
regex_pattern: Regex pattern with capturing group(s)

Returns:
Series with extracted and converted values
"""

def extract_and_convert(value):
if pd.isna(value) or value == "":
return np.nan

# YAML escapes backslashes, so we receive ".*\\d+$" which Python interprets as raw \
# We need to convert this to the actual regex pattern by replacing \\ with \
# However, since strings from YAML come already unescaped, we just use as-is
match = re.search(regex_pattern, str(value))
if match and match.groups():
extracted = match.group(1) # First capturing group
# Try to convert to numeric to handle both padded and non-padded numbers
try:
return pd.to_numeric(extracted)
except (ValueError, TypeError):
return extracted
return np.nan

return series.apply(extract_and_convert)

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def target_is_sorted_by(self, other_value: dict):
target = other_value.get("target")
within_columns = self._normalize_grouping_columns(other_value.get("within"))
columns = other_value["comparator"]
target_regex = other_value.get("regex") # parameter for regex extraction

result = pd.Series([True] * len(self.value), index=self.value.index)

Expand All @@ -1816,37 +1849,55 @@ def target_is_sorted_by(self, other_value: dict):
dict.fromkeys([target, comparator, *within_columns])
)

sorted_df = self.value[selected_columns].sort_values(
by=[*within_columns, target],
ascending=[True] * (len(within_columns) + 1),
)
# If regex is provided, extract and convert target values
if target_regex:
working_df = self.value[selected_columns].copy()
# Create a temporary column with extracted regex values
working_df[f"{target}_extracted"] = self._extract_regex_group(
working_df[target], target_regex
)
target_for_sorting = f"{target}_extracted"
# Sort by within columns only, preserve original order within groups
sorted_df = working_df.sort_values(
by=within_columns,

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here we would need to sort by within_columns and extracted target not preserve the order for regex branch.

ascending=[True] * len(within_columns),
)
else:
working_df = self.value[selected_columns]
target_for_sorting = target
sorted_df = working_df.sort_values(
by=[*within_columns, target],
ascending=[True] * (len(within_columns) + 1),
)

grouped_df = sorted_df.groupby(within_columns, sort=False)

target_check = grouped_df.apply(
lambda x: self.check_target_ascending_in_sorted_group(
x, target, comparator, ascending, na_pos
lambda x: self.check_target_ascending_in_sorted_group_with_regex(
x, target_for_sorting, comparator, ascending, na_pos
)
)
target_check = self._process_grouped_result(
target_check,
grouped_df,
within_columns,
sorted_df,
lambda group: self.check_target_ascending_in_sorted_group(
group, target, comparator, ascending, na_pos
lambda group: self.check_target_ascending_in_sorted_group_with_regex(
group, target_for_sorting, comparator, ascending, na_pos
),
)

date_overlap_check = grouped_df.apply(
lambda x: self.check_date_overlaps(x, target, comparator)
lambda x: self.check_date_overlaps(x, target_for_sorting, comparator)
)
date_overlap_check = self._process_grouped_result(
date_overlap_check,
grouped_df,
within_columns,
sorted_df,
lambda group: self.check_date_overlaps(group, target, comparator),
lambda group: self.check_date_overlaps(
group, target_for_sorting, comparator
),
)

combined_check = target_check & date_overlap_check
Expand Down
Loading
Loading