Skip to content

Commit ec427d9

Browse files
alexfurmenkovgithub-actionsRamilCDISCSFJohnson24
authored
702: Add regex support for target sorting in target_is_sorted_by operator (#1705)
* Add regex support for target sorting in target_is_sorted_by operator * Remove the deprecated function * Enhance target_is_sorted_by operator documentation with regex parameter details * Update merged schema files with markdown descriptions * Fix formatting of regex comment in target_is_sorted_by operator documentation * Update merged schema files with markdown descriptions * Enhance target sorting in target_is_sorted_by operator to include extracted target values --------- Co-authored-by: github-actions <github-actions@cdisc.org> Co-authored-by: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Co-authored-by: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com>
1 parent 32dc824 commit ec427d9

4 files changed

Lines changed: 281 additions & 14 deletions

File tree

cdisc_rules_engine/check_operators/dataframe_operators.py

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,12 +1664,12 @@ def _verify_neighbor_consistency(
16641664

16651665
return is_valid
16661666

1667-
def check_target_ascending_in_sorted_group(
1667+
def check_target_ascending_in_sorted_group_with_regex(
16681668
self, group, target, comparator, ascending, na_pos
16691669
):
16701670
"""
16711671
Check if target values are in ascending order within a group
1672-
already sorted by comparator.
1672+
already sorted by comparator. Supports regex extraction.
16731673
"""
16741674
is_valid = pd.Series(True, index=group.index)
16751675
is_numeric_comparator = pd.api.types.is_numeric_dtype(group[comparator])
@@ -1798,12 +1798,45 @@ def _process_grouped_result(
17981798
grouped_result = pd.Series(result_list, index=index_list)
17991799
return grouped_result.reindex(sorted_df.index, fill_value=True)
18001800

1801+
def _extract_regex_group(self, series: pd.Series, regex_pattern: str) -> pd.Series:
    """
    Extract the first capturing group of a regex from every value of a series.

    The result is suitable for use as a sort key:

    * If every captured value is numeric, the returned series is numeric, so
      zero-padded and non-padded numbers ("02" vs "10") compare by value.
    * If at least one captured value is not numeric, all captured values are
      returned as strings. Mixing numbers and strings in one column would make
      a subsequent ``sort_values`` raise ``TypeError``.

    Args:
        series: Pandas series whose values are searched (each value is
            converted with ``str`` before matching).
        regex_pattern: Regex pattern with at least one capturing group. It is
            used exactly as received; no additional unescaping is applied.

    Returns:
        Series aligned with ``series`` holding the extracted values. Entries
        are NaN when the input is null or an empty string, when the pattern
        does not match, when the pattern has no capturing group, or when the
        first group is empty or did not participate in the match.
    """
    # Compile once instead of looking the pattern up for every element.
    compiled_pattern = re.compile(regex_pattern)

    def extract_first_group(value):
        if pd.isna(value) or value == "":
            return np.nan

        match = compiled_pattern.search(str(value))
        if match is None or not match.groups():
            return np.nan

        captured = match.group(1)  # First capturing group
        # An optional group that did not participate yields None, and a
        # zero-width capture yields ""; both carry no sortable value.
        if captured is None or captured == "":
            return np.nan
        return captured

    extracted = series.apply(extract_first_group)

    # Convert the whole column at once so the result has a single,
    # comparable type rather than a per-value mix of numbers and strings.
    numeric = pd.to_numeric(extracted, errors="coerce")
    if numeric.notna().equals(extracted.notna()):
        return numeric
    return extracted
1832+
18011833
@log_operator_execution
18021834
@type_operator(FIELD_DATAFRAME)
18031835
def target_is_sorted_by(self, other_value: dict):
18041836
target = other_value.get("target")
18051837
within_columns = self._normalize_grouping_columns(other_value.get("within"))
18061838
columns = other_value["comparator"]
1839+
target_regex = other_value.get("regex") # parameter for regex extraction
18071840

18081841
result = pd.Series([True] * len(self.value), index=self.value.index)
18091842

@@ -1816,37 +1849,55 @@ def target_is_sorted_by(self, other_value: dict):
18161849
dict.fromkeys([target, comparator, *within_columns])
18171850
)
18181851

1819-
sorted_df = self.value[selected_columns].sort_values(
1820-
by=[*within_columns, target],
1821-
ascending=[True] * (len(within_columns) + 1),
1822-
)
1852+
# If regex is provided, extract and convert target values
1853+
if target_regex:
1854+
working_df = self.value[selected_columns].copy()
1855+
# Create a temporary column with extracted regex values
1856+
working_df[f"{target}_extracted"] = self._extract_regex_group(
1857+
working_df[target], target_regex
1858+
)
1859+
target_for_sorting = f"{target}_extracted"
1860+
# Sort by within columns AND extracted target
1861+
sorted_df = working_df.sort_values(
1862+
by=[*within_columns, target_for_sorting],
1863+
ascending=[True] * (len(within_columns) + 1),
1864+
)
1865+
else:
1866+
working_df = self.value[selected_columns]
1867+
target_for_sorting = target
1868+
sorted_df = working_df.sort_values(
1869+
by=[*within_columns, target],
1870+
ascending=[True] * (len(within_columns) + 1),
1871+
)
18231872

18241873
grouped_df = sorted_df.groupby(within_columns, sort=False)
18251874

18261875
target_check = grouped_df.apply(
1827-
lambda x: self.check_target_ascending_in_sorted_group(
1828-
x, target, comparator, ascending, na_pos
1876+
lambda x: self.check_target_ascending_in_sorted_group_with_regex(
1877+
x, target_for_sorting, comparator, ascending, na_pos
18291878
)
18301879
)
18311880
target_check = self._process_grouped_result(
18321881
target_check,
18331882
grouped_df,
18341883
within_columns,
18351884
sorted_df,
1836-
lambda group: self.check_target_ascending_in_sorted_group(
1837-
group, target, comparator, ascending, na_pos
1885+
lambda group: self.check_target_ascending_in_sorted_group_with_regex(
1886+
group, target_for_sorting, comparator, ascending, na_pos
18381887
),
18391888
)
18401889

18411890
date_overlap_check = grouped_df.apply(
1842-
lambda x: self.check_date_overlaps(x, target, comparator)
1891+
lambda x: self.check_date_overlaps(x, target_for_sorting, comparator)
18431892
)
18441893
date_overlap_check = self._process_grouped_result(
18451894
date_overlap_check,
18461895
grouped_df,
18471896
within_columns,
18481897
sorted_df,
1849-
lambda group: self.check_date_overlaps(group, target, comparator),
1898+
lambda group: self.check_date_overlaps(
1899+
group, target_for_sorting, comparator
1900+
),
18501901
)
18511902

18521903
combined_check = target_check & date_overlap_check

resources/schema/rule-merged/Operator.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -850,7 +850,7 @@
850850
"properties": {
851851
"operator": {
852852
"const": "target_is_sorted_by",
853-
"markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n"
853+
"markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value\nin ascending/descending order, grouped by the values in within. Each value entry\nrequires a variable name, a sort_order of asc or desc, and an optional\nnull_position of first or last (defaults to last) which controls where null/empty\ncomparator values are placed in the expected ordering. Within accepts either a\nsingle column or an ordered list of columns. Columns can be either number or Char\nDates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that\noverlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as\ntheir order cannot be inferred.\n\nOptionally supports a `regex` parameter that extracts a portion of the target\nvalue for sorting. The regex must contain at least one capturing group. The first\ncaptured group is extracted and converted to numeric if possible, allowing proper\nsorting of sequence numbers (e.g., \"MIDS1\", \"MIDS2\", ..., \"MIDS10\" with regex\n`.*?(\\\\d+)$`). This is particularly useful for variables that end with sequence\nnumbers that may or may not be zero-padded.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n\nExample with regex for extracting sequence numbers:\n\n```yaml\nCheck:\n all:\n - name: MIDS\n operator: target_is_sorted_by\n regex: \".*?(\\\\d+)$\" # Extract trailing digits, convert to numeric\n value:\n - name: SMSTDTC\n sort_order: asc\n within:\n - USUBJID\n - MIDSTYPE\n```\n"
854854
}
855855
},
856856
"required": ["operator", "value", "within"],

resources/schema/rule/Operator.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1127,7 +1127,22 @@ Complement of `is_ordered_by`
11271127

11281128
### target_is_sorted_by
11291129

1130-
True if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred.
1130+
True if the values in name are ordered according to the values specified by value
1131+
in ascending/descending order, grouped by the values in within. Each value entry
1132+
requires a variable name, a sort_order of asc or desc, and an optional
1133+
null_position of first or last (defaults to last) which controls where null/empty
1134+
comparator values are placed in the expected ordering. Within accepts either a
1135+
single column or an ordered list of columns. Columns can be either number or Char
1136+
Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that
1137+
overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as
1138+
their order cannot be inferred.
1139+
1140+
Optionally supports a `regex` parameter that extracts a portion of the target
1141+
value for sorting. The regex must contain at least one capturing group. The first
1142+
captured group is extracted and converted to numeric if possible, allowing proper
1143+
sorting of sequence numbers (e.g., "MIDS1", "MIDS2", ..., "MIDS10" with regex
1144+
`.*?(\d+)$`, written as `".*?(\\d+)$"` in a double-quoted YAML string). This is particularly useful for variables that end with sequence
1145+
numbers that may or may not be zero-padded.
11311146

11321147
```yaml
11331148
Check:
@@ -1143,6 +1158,22 @@ Check:
11431158
null_position: last
11441159
```
11451160

1161+
Example with regex for extracting sequence numbers:
1162+
1163+
```yaml
1164+
Check:
1165+
all:
1166+
- name: MIDS
1167+
operator: target_is_sorted_by
1168+
regex: ".*?(\\d+)$" # Extract trailing digits, convert to numeric
1169+
value:
1170+
- name: SMSTDTC
1171+
sort_order: asc
1172+
within:
1173+
- USUBJID
1174+
- MIDSTYPE
1175+
```
1176+
11461177
### target_is_not_sorted_by
11471178

11481179
Complement of `target_is_sorted_by`
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
"""
2+
Tests for target_is_sorted_by operator with regex support
3+
"""
4+
5+
import pytest
6+
import pandas as pd
7+
from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType
8+
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
9+
from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset
10+
11+
12+
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_non_padded(dataset_class):
    """
    Regex extraction with sequence numbers that are not zero-padded
    (lalala1, lalala2, lalala9, lalala10): every row is expected to pass.
    """
    columns = {
        "USUBJID": ["001", "001", "001", "001", "002", "002", "002"],
        "MIDSTYPE": ["A", "A", "A", "A", "B", "B", "B"],
        "MIDS": [
            "lalala1",
            "lalala2",
            "lalala9",
            "lalala10",
            "test1",
            "test2",
            "test10",
        ],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
            "2020-02-01",
            "2020-02-02",
            "2020-02-10",
        ],
    }
    dataset = dataset_class.from_dict(columns)

    # The non-greedy prefix lets the capturing group take every trailing digit.
    operator_params = {
        "target": "MIDS",
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(operator_params)

    # Every row is expected to be reported as sorted.
    expected = pd.Series([True, True, True, True, True, True, True])
    assert outcome.equals(expected)
55+
56+
57+
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_zero_padded(dataset_class):
    """
    Regex extraction with zero-padded sequence numbers
    (lalala01, lalala02, lalala09, lalala10): every row is expected to pass.
    """
    columns = {
        "USUBJID": ["001", "001", "001", "001"],
        "MIDSTYPE": ["A", "A", "A", "A"],
        "MIDS": ["lalala01", "lalala02", "lalala09", "lalala10"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
        ],
    }
    dataset = dataset_class.from_dict(columns)

    # The non-greedy prefix lets the capturing group take every trailing digit.
    operator_params = {
        "target": "MIDS",
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(operator_params)

    # Every row is expected to be reported as sorted.
    expected = pd.Series([True, True, True, True])
    assert outcome.equals(expected)
89+
90+
91+
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_invalid_order(dataset_class):
    """
    Rows whose dates disagree with the order of the extracted sequence
    numbers must be flagged even when regex extraction is used.
    """
    columns = {
        "USUBJID": ["001", "001", "001", "001"],
        "MIDSTYPE": ["A", "A", "A", "A"],
        "MIDS": ["lalala1", "lalala10", "lalala2", "lalala9"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
        ],
    }
    dataset = dataset_class.from_dict(columns)

    # The non-greedy prefix lets the capturing group take every trailing digit.
    operator_params = {
        "target": "MIDS",
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(operator_params)

    # Ordered by extracted number (1, 2, 9, 10) the dates read
    # 2020-01-01, 2020-01-09, 2020-01-10, 2020-01-02, which is not ascending.
    # Only the MIDS=1 row is expected to pass; results follow the input row order.
    expected = pd.Series([True, False, False, False])
    assert outcome.equals(expected)
125+
126+
127+
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_multiple_groups(dataset_class):
    """
    Regex-based sorting across several USUBJID / MIDSTYPE groups: each group
    is ordered correctly, so every row is expected to pass.
    """
    columns = {
        "USUBJID": ["001", "001", "001", "002", "002", "002"],
        "MIDSTYPE": ["A", "A", "A", "A", "A", "A"],
        "MIDS": ["M1", "M2", "M3", "M1", "M2", "M3"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
            "2020-02-01",
            "2020-02-02",
            "2020-02-03",
        ],
    }
    dataset = dataset_class.from_dict(columns)

    # The non-greedy prefix lets the capturing group take every trailing digit.
    operator_params = {
        "target": "MIDS",
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(operator_params)

    expected = pd.Series([True, True, True, True, True, True])
    assert outcome.equals(expected)
159+
160+
161+
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_without_regex_still_works(dataset_class):
    """
    Backward compatibility: with no ``regex`` key the operator is called as
    before and every row of this correctly ordered dataset is expected to pass.
    """
    columns = {
        "USUBJID": ["001", "001", "001"],
        "SESEQ": [1, 2, 3],
        "SESTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
        ],
    }
    dataset = dataset_class.from_dict(columns)

    # No "regex" entry; "within" is given as a single column name.
    operator_params = {
        "target": "SESEQ",
        "within": "USUBJID",
        "comparator": [{"name": "SESTDTC", "sort_order": "ASC"}],
    }

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(operator_params)

    expected = pd.Series([True, True, True])
    assert outcome.equals(expected)

0 commit comments

Comments
 (0)