Skip to content

Commit 46ef2f2

Browse files
Change distinct sets to lists (#1721)
* Change distinct sets to lists
* more distinct fixes and test fixes
* add none list to null_flavors

---------

Co-authored-by: Samuel Johnson <96841389+SFJohnson24@users.noreply.github.com>
1 parent 27edc13 commit 46ef2f2

4 files changed

Lines changed: 30 additions & 28 deletions

File tree

cdisc_rules_engine/constants/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
" SAS\\s{5}.{8}SASDATA .{16}\\s{24}.{16}(?P<modified_date>.{16})\\s{16}.{40}"
1515
)
1616

17-
NULL_FLAVORS = ["", None, {None}, [], {}, np.nan]
17+
NULL_FLAVORS = ["", None, {}, {None}, [], [None], np.nan]
1818

1919
KNOWN_REPORT_EXTENSIONS = [".json", ".xlsx", ".xls"]
2020

cdisc_rules_engine/operations/distinct.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def _execute_operation(self):
3535
data = result[self.params.target].unique()
3636
if len(data) > 0 and isinstance(data[0], bytes):
3737
data = data.astype(str)
38-
result = set(data)
38+
result = list(data)
3939
else:
4040
grouped = result.groupby(
4141
self.params.grouping, as_index=False, group_keys=False
@@ -52,7 +52,9 @@ def get_existing_column_names(group):
5252
),
5353
axis=1,
5454
)
55-
return pd.Series({operation_id: set(values.dropna().unique())})
55+
return pd.Series(
56+
{operation_id: list(values.dropna().sort_index().unique())}
57+
)
5658

5759
result = grouped.apply(get_existing_column_names).reset_index()
5860
elif isinstance(result.data, pd.DataFrame):
@@ -65,7 +67,7 @@ def get_existing_column_names(group):
6567
.unique()
6668
.rename({self.params.target: self.params.operation_id})
6769
)
68-
result = result.apply(set).to_frame().reset_index()
70+
result = result.apply(list).to_frame().reset_index()
6971
return result
7072

7173
def _get_referenced_datasets(self):
@@ -76,4 +78,4 @@ def _get_referenced_datasets(self):
7678
return referenced_datasets
7779

7880
def _unique_values_for_column(self, column):
79-
return pd.Series({self.params.operation_id: set(column.unique())})
81+
return pd.Series({self.params.operation_id: list(column.unique())})

tests/unit/test_operations/test_distinct.py

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,11 +16,11 @@
1616
[
1717
(
1818
PandasDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
19-
{5, 9, 11, 12, 18},
19+
[11, 12, 5, 18, 9],
2020
),
2121
(
2222
DaskDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
23-
{5, 9, 11, 12, 18},
23+
[11, 12, 5, 18, 9],
2424
),
2525
],
2626
)
@@ -44,14 +44,14 @@ def test_distinct(data, expected, operation_params: OperationParams):
4444
PandasDataset.from_dict(
4545
{"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
4646
),
47-
{1: {5, 9, 11}, 2: {12, 18}},
47+
{1: [11, 5, 9], 2: [12, 18]},
4848
None,
4949
),
5050
(
5151
DaskDataset.from_dict(
5252
{"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
5353
),
54-
{1: {5, 9, 11}, 2: {12, 18}},
54+
{1: [11, 5, 9], 2: [12, 18]},
5555
None,
5656
),
5757
(
@@ -62,7 +62,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
6262
"subject": [1, 2, 2, 1, 2, 3],
6363
}
6464
),
65-
{1: {5, 9, 11}, 2: {12, 18}, 3: None},
65+
{1: [11, 5, 9], 2: [12, 18], 3: None},
6666
["subject"],
6767
),
6868
(
@@ -73,7 +73,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
7373
"subject": [1, 2, 2, 1, 2, 3],
7474
}
7575
),
76-
{1: {5, 9, 11}, 2: {12, 18}, 3: None},
76+
{1: [11, 5, 9], 2: [12, 18], 3: None},
7777
["subject"],
7878
),
7979
],
@@ -110,7 +110,7 @@ def test_grouped_distinct(
110110
"scat": ["a", "a", "a", "a", "a", "b"],
111111
}
112112
),
113-
{1: {5, 11}, 2: {12}},
113+
{1: [11, 5], 2: [12]},
114114
None,
115115
{"cat": 1, "scat": "a"},
116116
),
@@ -123,7 +123,7 @@ def test_grouped_distinct(
123123
"scat": ["a", "a", "a", "a", "a", "b"],
124124
}
125125
),
126-
{1: {5, 11}, 2: {12}},
126+
{1: [11, 5], 2: [12]},
127127
None,
128128
{"cat": 1, "scat": "a"},
129129
),
@@ -137,7 +137,7 @@ def test_grouped_distinct(
137137
"subject": [1, 2, 2, 1, 2, 3],
138138
}
139139
),
140-
{1: {5, 11}, 2: {12}, 3: None},
140+
{1: [11, 5], 2: [12], 3: None},
141141
["subject"],
142142
{"cat": 1, "scat": "a"},
143143
),
@@ -151,7 +151,7 @@ def test_grouped_distinct(
151151
"subject": [1, 2, 2, 1, 2, 3],
152152
}
153153
),
154-
{1: {5, 11}, 2: {12}, 3: None},
154+
{1: [11, 5], 2: [12], 3: None},
155155
["subject"],
156156
{"cat": 1, "scat": "a"},
157157
),
@@ -195,7 +195,7 @@ def test_filtered_grouped_distinct(
195195
"LBCAT": ["CAT1", "CAT2"],
196196
}
197197
),
198-
{"LBTEST", "LBSEQ"},
198+
["LBTEST", "LBSEQ"],
199199
),
200200
(
201201
DaskDataset.from_dict(
@@ -211,7 +211,7 @@ def test_filtered_grouped_distinct(
211211
"LBCAT": ["CAT1", "CAT2"],
212212
}
213213
),
214-
{"LBTEST", "LBSEQ"},
214+
["LBTEST", "LBSEQ"],
215215
),
216216
],
217217
)
@@ -262,7 +262,7 @@ def mock_get_dataset(dataset_name, **kwargs):
262262
"LBCAT": ["CAT1", "CAT2"],
263263
}
264264
),
265-
{1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
265+
{1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
266266
["subject"],
267267
),
268268
(
@@ -281,7 +281,7 @@ def mock_get_dataset(dataset_name, **kwargs):
281281
"LBCAT": ["CAT1", "CAT2"],
282282
}
283283
),
284-
{1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
284+
{1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
285285
["subject"],
286286
),
287287
],

tests/unit/test_utilities/test_rule_processor.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -498,7 +498,7 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation):
498498
assert result["$max_aestdy"][0] == df["AESTDY"].max()
499499
assert result["$min_aestdy"][0] == df["AESTDY"].min()
500500
assert result["$avg_aestdy"][0] == df["AESTDY"].mean()
501-
assert result["$unique_aestdy"].equals(pd.Series([{11, 12, 40, 59}] * len(df)))
501+
assert result["$unique_aestdy"].equals(pd.Series([[11, 12, 40, 59]] * len(df)))
502502

503503

504504
@pytest.mark.parametrize("dataset_implementation", [PandasDataset, DaskDataset])
@@ -603,22 +603,22 @@ def test_perform_rule_operation_with_grouping(
603603
200,
604604
],
605605
"$unique_aestdy": [
606-
{
606+
[
607607
10,
608608
40,
609-
},
610-
{
609+
],
610+
[
611611
11,
612612
59,
613-
},
614-
{
613+
],
614+
[
615615
10,
616616
40,
617-
},
618-
{
617+
],
618+
[
619619
11,
620620
59,
621-
},
621+
],
622622
],
623623
}
624624
)

0 commit comments

Comments
 (0)