Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,7 @@ def equals_string_part(self, other_value):
column with a regex
"""
target = other_value.get("target")
type_insensitive = other_value.get("type_insensitive", False)
comparator = other_value.get("comparator")
regex = other_value.get("regex")
value_is_literal: bool = other_value.get("value_is_literal", False)
Expand All @@ -821,7 +822,11 @@ def equals_string_part(self, other_value):
self.value[parsed_id] = parsed_data
return self.value.apply(
lambda row: self._check_equality(
row, target, parsed_id, value_is_literal=False
row,
target,
parsed_id,
value_is_literal=False,
type_insensitive=type_insensitive,
),
axis=1,
)
Expand Down
10 changes: 8 additions & 2 deletions resources/schema/rule-merged/Operator.json
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@
"properties": {
"operator": {
"const": "does_not_equal_string_part",
"markdownDescription": "\nComplement of `equals_string_part`\n"
"markdownDescription": "\nComplement of `equals_string_part`. Also has the optional parameter 'type_insensitive'.\n"
},
"type_insensitive": {
"type": "boolean"
}
},
"required": ["operator", "value", "regex"],
Expand Down Expand Up @@ -223,7 +226,10 @@
"properties": {
"operator": {
"const": "equals_string_part",
"markdownDescription": "\nChecks that the values in the target column equal the result of parsing the value in the comparison column with a regex\n\n> RDOMAIN equals characters 5 and 6 of SUPP dataset name\n\n```yaml\n- name: RDOMAIN\n operator: equals_string_part\n value: dataset_name\n regex: \".{4}(..).*\"\n```\n"
"markdownDescription": "\nChecks that the values in the target column equal the result of parsing the value in the comparison column with a regex\nHas optional parameter:\n\n- 'type_insensitive' when true, both values are converted to strings before comparison to handle type mismatches between string and numeric data. NOTE: all trailing zeroes will be removed in both strings and floats.\n\n> RDOMAIN equals characters 5 and 6 of SUPP dataset name\n\n```yaml\n- name: RDOMAIN\n operator: equals_string_part\n type_insensitive: true\n value: dataset_name\n regex: \".{4}(..).*\"\n```\n"
},
"type_insensitive": {
"type": "boolean"
}
},
"required": ["operator", "value", "regex"],
Expand Down
10 changes: 8 additions & 2 deletions resources/schema/rule/Operator.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@
"type": "object"
},
{
"properties": { "operator": { "const": "does_not_equal_string_part" } },
"properties": {
"operator": { "const": "does_not_equal_string_part" },
"type_insensitive": { "type": "boolean" }
},
"required": ["operator", "value", "regex"],
"type": "object"
},
Expand Down Expand Up @@ -120,7 +123,10 @@
"type": "object"
},
{
"properties": { "operator": { "const": "equals_string_part" } },
"properties": {
"operator": { "const": "equals_string_part" },
"type_insensitive": { "type": "boolean" }
},
"required": ["operator", "value", "regex"],
"type": "object"
},
Expand Down
6 changes: 5 additions & 1 deletion resources/schema/rule/Operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,17 +156,21 @@ Text-based operations including regex pattern matching, substring operations, pr

### does_not_equal_string_part

Complement of `equals_string_part`
Complement of `equals_string_part`. Also has the optional parameter 'type_insensitive'.

### equals_string_part

Checks that the values in the target column equal the result of parsing the value in the comparison column with a regex
Has optional parameter:

- 'type_insensitive' when true, both values are converted to strings before comparison to handle type mismatches between string and numeric data. NOTE: all trailing zeroes will be removed in both strings and floats.
Comment thread
aniemes marked this conversation as resolved.

> RDOMAIN equals characters 5 and 6 of SUPP dataset name

```yaml
- name: RDOMAIN
operator: equals_string_part
type_insensitive: true
value: dataset_name
regex: ".{4}(..).*"
```
Expand Down
119 changes: 119 additions & 0 deletions tests/unit/test_check_operators/test_string_comparison.py

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR looks great to me. I will recommend minor updates to the tests for covering the edge cases of

  1. No regex match
  2. Empty string/null target
  3. Case for deos_not_equal_string_part False
  4. Test for leading zero behavior.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Ramil,
Thank you for reviewing it! I will work on the changes now to cover the edge cases and update the Operator.md file as you said above.
Thanks!
Adam

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@RamilCDISC
So I am having some trouble with the 'no regex' as this seems to be expected. I added this into a new test case:
if operator == "equals_string_part": result = dataframe_type.equals_string_part( { "target": "target", "comparator": comparator, "regex": "(.*)", "type_insensitive": True, } )

But this is still using regex. Can you suggest how to test this without using regex? I am also have trouble with the empty string/null targets.

I tried this:
( {"VAR2": ["", ""], "target": [0, 0]}, "VAR2", "equals_string_part", PandasDataset, [True, True], )
And an empty target array but neither worked.

Pasted test file for reference.

test_string_comparison.py

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My intention with 'no regex match' was to ask for a test where the regex does not return any match, not to test without a regex. I see that my comment was not very clear. The edge cases I requested is to have a case where the passed regex does not return any match. Passing a regex is required.

For the null/empty target case, your example has empty values in VAR2 not in the target. target is '[0,0]'. I would test it with explicit empty/null values in target columns.

Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@
DaskDataset,
[True, True, False],
),
(
{"VAR2": ["<40"], "target": [40]},
"VAR2",
".(.*)",
PandasDataset,
[False],
),
(
{"VAR2": ["<40"], "target": [40]},
"VAR2",
".(.*)",
DaskDataset,
[False],
),
],
)
def test_equals_string_part(data, comparator, regex, dataset_type, expected_result):
Expand All @@ -33,6 +47,111 @@ def test_equals_string_part(data, comparator, regex, dataset_type, expected_resu
assert result.equals(df.convert_to_series(expected_result))


@pytest.mark.parametrize(
"data,comparator,operator,regex,dataset_type,expected_result",
[
(
{"VAR2": [">=40", "<=50"], "target": [40, 50]},
"VAR2",
"equals_string_part",
".{2}(.*)",
PandasDataset,
[True, True],
),
(
{"VAR2": [">=40", "<=50"], "target": [40, 50]},
"VAR2",
"equals_string_part",
".{2}(.*)",
DaskDataset,
[True, True],
),
(
{"VAR2": [">=42 ", "<=55 "], "target": [40, 50]},
"VAR2",
"does_not_equal_string_part",
".{2}(.*)",
PandasDataset,
[True, True],
),
(
{"VAR2": [">=42 ", "<=55 "], "target": [40, 50]},
"VAR2",
"does_not_equal_string_part",
".{2}(.*)",
DaskDataset,
[True, True],
),
(
{"VAR2": [">40", "<50"], "target": [40, 50]},
"VAR2",
"equals_string_part",
".(.*)",
DaskDataset,
[True, True],
),
(
{"VAR2": [">45", "<52"], "target": [40, 50]},
"VAR2",
"does_not_equal_string_part",
".(.*)",
PandasDataset,
[True, True],
),
(
{"VAR2": [">45", "<52"], "target": [40, 50]},
"VAR2",
"does_not_equal_string_part",
".(.*)",
DaskDataset,
[True, True],
),
(
{"VAR2": [">40", "<50"], "target": [40.0, 50.0]},
"VAR2",
"equals_string_part",
".(.*)",
PandasDataset,
[True, True],
),
(
{"VAR2": [">45", "<52"], "target": [40.0, 50.0]},
"VAR2",
"does_not_equal_string_part",
".(.*)",
DaskDataset,
[True, True],
),
],
)
def test_equals_string_part_type_insensitive(
data, comparator, operator, regex, dataset_type, expected_result
):
df = dataset_type.from_dict(data)
dataframe_type = DataframeType({"value": df})

if operator == "equals_string_part":
result = dataframe_type.equals_string_part(
{
"target": "target",
"comparator": comparator,
"regex": regex,
"type_insensitive": True,
}
)
else:
result = dataframe_type.does_not_equal_string_part(
{
"target": "target",
"comparator": comparator,
"regex": regex,
"type_insensitive": True,
}
)

assert result.equals(df.convert_to_series(expected_result))


@pytest.mark.parametrize(
"data,comparator,regex,dataset_type,value_is_literal,expected_result",
[
Expand Down
Loading