Skip to content

Commit e13ed6a

Browse files
DmitryMKSFJohnson24github-actions
authored
Updating minus operator to be order-sensitive (#1694)
* Updating minus operator * Addressing comments, changing default value of order_insensitive to True * Fixed an error, as I have updated the incorrect default value * Update merged schema files with markdown descriptions --------- Co-authored-by: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com> Co-authored-by: github-actions <github-actions@cdisc.org>
1 parent d408ae6 commit e13ed6a

8 files changed

Lines changed: 120 additions & 7 deletions

File tree

cdisc_rules_engine/models/operation_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class OperationParams:
5959
source: str = None
6060
target: str = None
6161
subtract: str = None
62+
order_insensitive: bool = True
6263
value_is_reference: bool = False
6364
namespace: str = None
6465
delimiter: str = None

cdisc_rules_engine/operations/minus.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def _normalize_to_list(val):
1717
return [val]
1818

1919

20-
def _set_difference_preserve_order(list_a: list, list_b: list) -> list:
20+
def _set_difference_order_insensitive(list_a: list, list_b: list) -> list:
2121
"""
2222
Compute set difference A \\ B (elements in A not in B).
2323
Preserves order from list_a.
@@ -26,6 +26,43 @@ def _set_difference_preserve_order(list_a: list, list_b: list) -> list:
2626
return [x for x in _normalize_to_list(list_a) if x not in set_b]
2727

2828

29+
def _set_difference_order_sensitive(list_a: list, list_b: list) -> list:
    """
    Compute an order-sensitive difference A \\ B.

    Items of ``list_b`` are consumed in sequence against ``list_a``: each one
    is searched for only in the part of ``list_a`` that follows the previous
    match. When it is found, that occurrence and any copies immediately
    following it are removed, while the elements skipped on the way to it are
    kept. Items of ``list_b`` that do not occur in the remaining part of
    ``list_a`` are ignored, since there is nothing to subtract.

    Example: ``["A", "C", "D", "B"]`` minus ``["B", "D"]`` gives
    ``["A", "C", "D"]``; "D" is kept because it precedes "B" in ``list_a``.

    Preserves order from list_a.

    :param list_a: value(s) to subtract from; passed through _normalize_to_list.
    :param list_b: value(s) to subtract, in the order they are expected to
        appear in ``list_a``; passed through _normalize_to_list.
    :return: a new list with the elements of ``list_a`` that were not removed.
    """
    items_a = list(_normalize_to_list(list_a))
    total = len(items_a)
    result = []
    # Index of the first element of A that has not been consumed yet.
    cursor = 0

    for b_item in _normalize_to_list(list_b):
        try:
            # Search only the unconsumed tail of A; no per-item set is built,
            # so elements are not required to be hashable.
            match_index = items_a.index(b_item, cursor)
        except ValueError:
            # b_item is not in the remaining part of A: nothing to subtract.
            continue

        # Everything skipped before the match is out of order relative to
        # list_b and therefore stays in the result.
        result.extend(items_a[cursor:match_index])
        cursor = match_index + 1

        # Drop duplicates of b_item that directly follow the match as well.
        while cursor < total and items_a[cursor] == b_item:
            cursor += 1

    # Add any remaining items in A after the last matched index.
    result.extend(items_a[cursor:])
    return result
64+
65+
2966
class Minus(BaseOperation):
3067
"""
3168
Operation that computes set difference: name minus subtract.
@@ -43,4 +80,7 @@ def _execute_operation(self):
4380
if not subtract_ref or subtract_ref not in self.evaluation_dataset.columns:
4481
return _normalize_to_list(list_a)
4582
list_b = self.evaluation_dataset[subtract_ref].iloc[0]
46-
return _set_difference_preserve_order(list_a, list_b)
83+
if self.params.order_insensitive:
84+
return _set_difference_order_insensitive(list_a, list_b)
85+
else:
86+
return _set_difference_order_sensitive(list_a, list_b)

cdisc_rules_engine/utilities/rule_processor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ def perform_rule_operations(
429429
term_pref_term=operation.get("term_pref_term"),
430430
term_value=operation.get("term_value"),
431431
value_is_reference=operation.get("value_is_reference", False),
432+
order_insensitive=operation.get("order_insensitive", True),
432433
)
433434
try:
434435
# execute operation

resources/schema/rule-merged/Operations.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@
267267
"properties": {
268268
"operator": {
269269
"const": "minus",
270-
"markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. Uses [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A \u2216 B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n```\n"
270+
"markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. By default a standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A \u2216 B) is applied. Optional `order_insensitive` property allows to have the element order to be taken into consideration and only those `name` elements are removed which follow the same order as in `subtract` . Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n order_insensitive: false\n```\n"
271271
}
272272
},
273273
"required": ["id", "operator", "name", "subtract"],
@@ -697,6 +697,9 @@
697697
"value_is_reference": {
698698
"type": "boolean"
699699
},
700+
"order_insensitive": {
701+
"type": "boolean"
702+
},
700703
"version": {
701704
"type": "string"
702705
}

resources/schema/rule/Operations.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,9 @@
644644
"value_is_reference": {
645645
"type": "boolean"
646646
},
647+
"order_insensitive": {
648+
"type": "boolean"
649+
},
647650
"version": {
648651
"type": "string"
649652
}

resources/schema/rule/Operations.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,7 @@ Operations:
779779
780780
### minus
781781
782-
Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.
782+
Computes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A ∖ B) are applied. Setting the optional `order_insensitive` property to `false` takes element order into account: only those `name` elements that follow the same order as in `subtract` are removed. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.
783783

784784
```yaml
785785
Operations:
@@ -791,6 +791,7 @@ Operations:
791791
name: $expected_variables
792792
operator: minus
793793
subtract: $dataset_variables
794+
order_insensitive: false
794795
```
795796

796797
### label_referenced_variable_metadata

resources/schema/rule/check_parameter.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,10 @@ Reference to another operation result, used as the second operand in operations
277277
subtract: $dataset_variables
278278
```
279279

280+
### order_insensitive
281+
282+
Optional boolean parameter for the `minus` operator. When set to `true` or not specified, `minus` performs a standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) operation, ignoring the order of elements. When set to `false`, `minus` considers element order and only removes elements from `name` that follow the same order as in `subtract`.
283+
280284
### term_code
281285

282286
Terminology code value used in controlled terminology operations for code-based lookups.

tests/unit/test_operations/test_minus.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
55

66
from cdisc_rules_engine.models.operation_params import OperationParams
7-
from cdisc_rules_engine.operations.minus import Minus, _set_difference_preserve_order
7+
from cdisc_rules_engine.operations.minus import (
8+
Minus,
9+
_set_difference_order_insensitive,
10+
_set_difference_order_sensitive,
11+
)
812
import pytest
913

1014

@@ -32,10 +36,24 @@ def minus_params(operation_params: OperationParams) -> OperationParams:
3236
(["a", "", "b"], [""], ["a", "b"]),
3337
(["a", "", "b"], ["c"], ["a", "", "b"]),
3438
([""], [""], []),
39+
(["A", "C", "D", "B"], ["B", "D"], ["A", "C", "D"]),
40+
(["A", "C", "D", "B"], ["A", "B", "D"], ["C", "D"]),
3541
],
3642
)
37-
def test_set_difference_preserve_order(list_a, list_b, expected):
38-
assert _set_difference_preserve_order(list_a, list_b) == expected
43+
def test_set_difference_order_sensitive(list_a, list_b, expected):
44+
assert _set_difference_order_sensitive(list_a, list_b) == expected
45+
46+
47+
@pytest.mark.parametrize(
    "list_a,list_b,expected",
    [
        (["c", "b", "a"], ["b"], ["c", "a"]),
        (["A", "C", "D", "B"], ["B", "D"], ["A", "C"]),
        (["A", "C", "D", "B"], ["A", "B", "D"], ["C"]),
    ],
)
def test_set_difference_order_insensitive(list_a, list_b, expected):
    """Elements of list_b are removed from list_a wherever they occur."""
    remaining = _set_difference_order_insensitive(list_a, list_b)
    assert remaining == expected
3957

4058

4159
@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
@@ -58,6 +76,48 @@ def test_minus_operation(minus_params: OperationParams, dataset_type):
5876
assert list(result[minus_params.operation_id].iloc[0]) == ["AEDECOD"]
5977

6078

79+
@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_incorrect_order(minus_params: OperationParams, dataset_type):
    """With order_insensitive=False the out-of-order DOMAIN is kept in the result."""
    expected_variables = ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"]
    # DOMAIN and AESEQ are swapped compared with the expected order.
    dataset_variables = ["STUDYID", "AESEQ", "DOMAIN", "AETERM"]
    eval_dataset = dataset_type.from_dict(
        {
            "$expected_variables": [list(expected_variables) for _ in range(2)],
            "$dataset_variables": [list(dataset_variables) for _ in range(2)],
        }
    )
    minus_params.order_insensitive = False

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["DOMAIN", "AEDECOD"]
98+
99+
100+
@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_order_insensitive(minus_params: OperationParams, dataset_type):
    """With order_insensitive=True only the truly missing AEDECOD is returned."""
    expected_variables = ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"]
    # Same variables as expected (minus AEDECOD) but in a different order.
    dataset_variables = ["STUDYID", "AESEQ", "DOMAIN", "AETERM"]
    eval_dataset = dataset_type.from_dict(
        {
            "$expected_variables": [list(expected_variables) for _ in range(2)],
            "$dataset_variables": [list(dataset_variables) for _ in range(2)],
        }
    )
    minus_params.order_insensitive = True

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["AEDECOD"]
119+
120+
61121
@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
62122
def test_minus_empty_subtract_returns_all_of_name(
63123
minus_params: OperationParams, dataset_type

0 commit comments

Comments
 (0)